diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7951 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9696969696969697, + "eval_steps": 141, + "global_step": 1122, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017825311942959, + "grad_norm": 13.759645462036133, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.5522, + "step": 1 + }, + { + "epoch": 0.0017825311942959, + "eval_loss": 1.6059316396713257, + "eval_runtime": 329.95, + "eval_samples_per_second": 8.917, + "eval_steps_per_second": 1.115, + "step": 1 + }, + { + "epoch": 0.0035650623885918, + "grad_norm": 14.121225357055664, + "learning_rate": 4.000000000000001e-06, + "loss": 1.6595, + "step": 2 + }, + { + "epoch": 0.0053475935828877, + "grad_norm": 11.482634544372559, + "learning_rate": 6e-06, + "loss": 1.6028, + "step": 3 + }, + { + "epoch": 0.0071301247771836, + "grad_norm": 9.531785011291504, + "learning_rate": 8.000000000000001e-06, + "loss": 1.4464, + "step": 4 + }, + { + "epoch": 0.008912655971479501, + "grad_norm": 9.846260070800781, + "learning_rate": 1e-05, + "loss": 1.3576, + "step": 5 + }, + { + "epoch": 0.0106951871657754, + "grad_norm": 7.445741653442383, + "learning_rate": 1.2e-05, + "loss": 1.2678, + "step": 6 + }, + { + "epoch": 0.012477718360071301, + "grad_norm": 5.630918979644775, + "learning_rate": 1.4e-05, + "loss": 1.299, + "step": 7 + }, + { + "epoch": 0.0142602495543672, + "grad_norm": 6.509588241577148, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.2572, + "step": 8 + }, + { + "epoch": 0.016042780748663103, + "grad_norm": 6.023720741271973, + "learning_rate": 1.8e-05, + "loss": 1.2671, + "step": 9 + }, + { + "epoch": 0.017825311942959002, + "grad_norm": 4.982049465179443, + "learning_rate": 2e-05, + "loss": 1.1808, + "step": 10 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 4.719905853271484, + "learning_rate": 1.9999960092007093e-05, + "loss": 1.2005, + "step": 11 + }, + { + "epoch": 0.0213903743315508, + "grad_norm": 4.139680862426758, + "learning_rate": 1.9999840368346898e-05, + "loss": 1.1395, + "step": 12 + }, + { + "epoch": 0.023172905525846704, + "grad_norm": 3.6177799701690674, + "learning_rate": 1.9999640829975005e-05, + "loss": 1.2353, + "step": 13 + }, + { + "epoch": 0.024955436720142603, + "grad_norm": 3.3829057216644287, + "learning_rate": 1.9999361478484043e-05, + "loss": 1.1544, + "step": 14 + }, + { + "epoch": 0.026737967914438502, + "grad_norm": 3.5621323585510254, + "learning_rate": 1.9999002316103692e-05, + "loss": 1.2219, + "step": 15 + }, + { + "epoch": 0.0285204991087344, + "grad_norm": 3.8984572887420654, + "learning_rate": 1.9998563345700635e-05, + "loss": 1.131, + "step": 16 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 3.623622417449951, + "learning_rate": 1.999804457077856e-05, + "loss": 1.174, + "step": 17 + }, + { + "epoch": 0.03208556149732621, + "grad_norm": 3.9878575801849365, + "learning_rate": 1.999744599547812e-05, + "loss": 1.206, + "step": 18 + }, + { + "epoch": 0.0338680926916221, + "grad_norm": 3.3543357849121094, + "learning_rate": 1.9996767624576902e-05, + "loss": 1.1469, + "step": 19 + }, + { + "epoch": 0.035650623885918005, + "grad_norm": 3.7037160396575928, + "learning_rate": 1.9996009463489393e-05, + "loss": 1.1786, + "step": 20 + }, + { + "epoch": 0.0374331550802139, + "grad_norm": 3.6356801986694336, + "learning_rate": 1.9995171518266926e-05, + "loss": 1.1448, + "step": 21 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 3.112527370452881, + "learning_rate": 1.999425379559765e-05, + "loss": 1.137, + "step": 22 + }, + { + "epoch": 0.040998217468805706, + "grad_norm": 3.8700811862945557, + "learning_rate": 1.9993256302806452e-05, + "loss": 1.1646, + "step": 23 + }, + { + "epoch": 0.0427807486631016, + "grad_norm": 4.020349502563477, + "learning_rate": 1.9992179047854923e-05, + "loss": 1.1245, + "step": 24 + }, + { + "epoch": 0.044563279857397504, + "grad_norm": 3.30739426612854, + "learning_rate": 1.9991022039341278e-05, + "loss": 1.1164, + "step": 25 + }, + { + "epoch": 0.04634581105169341, + "grad_norm": 3.206475019454956, + "learning_rate": 1.9989785286500294e-05, + "loss": 1.2038, + "step": 26 + }, + { + "epoch": 0.0481283422459893, + "grad_norm": 3.0726990699768066, + "learning_rate": 1.998846879920324e-05, + "loss": 1.09, + "step": 27 + }, + { + "epoch": 0.049910873440285206, + "grad_norm": 3.2845897674560547, + "learning_rate": 1.9987072587957784e-05, + "loss": 1.143, + "step": 28 + }, + { + "epoch": 0.05169340463458111, + "grad_norm": 3.3358678817749023, + "learning_rate": 1.9985596663907924e-05, + "loss": 1.1128, + "step": 29 + }, + { + "epoch": 0.053475935828877004, + "grad_norm": 3.362863063812256, + "learning_rate": 1.99840410388339e-05, + "loss": 1.2224, + "step": 30 + }, + { + "epoch": 0.05525846702317291, + "grad_norm": 3.2610557079315186, + "learning_rate": 1.9982405725152073e-05, + "loss": 1.039, + "step": 31 + }, + { + "epoch": 0.0570409982174688, + "grad_norm": 2.9130852222442627, + "learning_rate": 1.998069073591488e-05, + "loss": 1.0769, + "step": 32 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 3.179290533065796, + "learning_rate": 1.9978896084810656e-05, + "loss": 1.1426, + "step": 33 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 2.628631591796875, + "learning_rate": 1.99770217861636e-05, + "loss": 1.0908, + "step": 34 + }, + { + "epoch": 0.062388591800356503, + "grad_norm": 3.04935884475708, + "learning_rate": 1.9975067854933607e-05, + "loss": 1.0795, + "step": 35 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 3.455918312072754, + "learning_rate": 1.9973034306716173e-05, + "loss": 1.1744, + "step": 36 + }, + { + "epoch": 0.0659536541889483, + "grad_norm": 3.023421049118042, + "learning_rate": 1.997092115774226e-05, + "loss": 1.1438, + "step": 37 + }, + { + "epoch": 0.0677361853832442, + "grad_norm": 2.92010760307312, + "learning_rate": 1.9968728424878178e-05, + "loss": 1.0627, + "step": 38 + }, + { + "epoch": 0.06951871657754011, + "grad_norm": 2.4813754558563232, + "learning_rate": 1.996645612562544e-05, + "loss": 1.1508, + "step": 39 + }, + { + "epoch": 0.07130124777183601, + "grad_norm": 4.026529788970947, + "learning_rate": 1.9964104278120624e-05, + "loss": 1.078, + "step": 40 + }, + { + "epoch": 0.07308377896613191, + "grad_norm": 3.1887829303741455, + "learning_rate": 1.9961672901135238e-05, + "loss": 1.0675, + "step": 41 + }, + { + "epoch": 0.0748663101604278, + "grad_norm": 3.5456881523132324, + "learning_rate": 1.9959162014075553e-05, + "loss": 0.993, + "step": 42 + }, + { + "epoch": 0.0766488413547237, + "grad_norm": 2.937511444091797, + "learning_rate": 1.9956571636982463e-05, + "loss": 1.1662, + "step": 43 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 2.6722967624664307, + "learning_rate": 1.9953901790531315e-05, + "loss": 1.0205, + "step": 44 + }, + { + "epoch": 0.08021390374331551, + "grad_norm": 2.8215572834014893, + "learning_rate": 1.9951152496031755e-05, + "loss": 1.066, + "step": 45 + }, + { + "epoch": 0.08199643493761141, + "grad_norm": 3.3156650066375732, + "learning_rate": 1.994832377542755e-05, + "loss": 1.1836, + "step": 46 + }, + { + "epoch": 0.08377896613190731, + "grad_norm": 2.6018829345703125, + "learning_rate": 1.9945415651296408e-05, + "loss": 1.1132, + "step": 47 + }, + { + "epoch": 0.0855614973262032, + "grad_norm": 2.6550025939941406, + "learning_rate": 1.994242814684981e-05, + "loss": 1.1259, + "step": 48 + }, + { + "epoch": 0.0873440285204991, + "grad_norm": 3.2128894329071045, + "learning_rate": 1.9939361285932818e-05, + "loss": 1.1379, + "step": 49 + }, + { + "epoch": 0.08912655971479501, + "grad_norm": 2.9681684970855713, + "learning_rate": 1.9936215093023884e-05, + "loss": 1.1592, + "step": 50 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 2.9430744647979736, + "learning_rate": 1.9932989593234656e-05, + "loss": 1.0492, + "step": 51 + }, + { + "epoch": 0.09269162210338681, + "grad_norm": 2.971011161804199, + "learning_rate": 1.9929684812309783e-05, + "loss": 1.1518, + "step": 52 + }, + { + "epoch": 0.0944741532976827, + "grad_norm": 3.132988691329956, + "learning_rate": 1.9926300776626694e-05, + "loss": 0.9236, + "step": 53 + }, + { + "epoch": 0.0962566844919786, + "grad_norm": 2.9208507537841797, + "learning_rate": 1.9922837513195406e-05, + "loss": 1.1269, + "step": 54 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 3.3337550163269043, + "learning_rate": 1.9919295049658296e-05, + "loss": 1.124, + "step": 55 + }, + { + "epoch": 0.09982174688057041, + "grad_norm": 3.489339828491211, + "learning_rate": 1.9915673414289885e-05, + "loss": 1.1692, + "step": 56 + }, + { + "epoch": 0.10160427807486631, + "grad_norm": 3.233980417251587, + "learning_rate": 1.991197263599662e-05, + "loss": 1.0572, + "step": 57 + }, + { + "epoch": 0.10338680926916222, + "grad_norm": 3.279974937438965, + "learning_rate": 1.990819274431662e-05, + "loss": 1.0731, + "step": 58 + }, + { + "epoch": 0.1051693404634581, + "grad_norm": 3.0058257579803467, + "learning_rate": 1.990433376941947e-05, + "loss": 1.0906, + "step": 59 + }, + { + "epoch": 0.10695187165775401, + "grad_norm": 2.8452155590057373, + "learning_rate": 1.9900395742105948e-05, + "loss": 1.0852, + "step": 60 + }, + { + "epoch": 0.10873440285204991, + "grad_norm": 2.801748037338257, + "learning_rate": 1.989637869380782e-05, + "loss": 1.0664, + "step": 61 + }, + { + "epoch": 0.11051693404634581, + "grad_norm": 2.682849168777466, + "learning_rate": 1.989228265658754e-05, + "loss": 1.1374, + "step": 62 + }, + { + "epoch": 0.11229946524064172, + "grad_norm": 2.923135995864868, + "learning_rate": 1.988810766313804e-05, + "loss": 1.0354, + "step": 63 + }, + { + "epoch": 0.1140819964349376, + "grad_norm": 2.826714277267456, + "learning_rate": 1.9883853746782447e-05, + "loss": 1.0734, + "step": 64 + }, + { + "epoch": 0.11586452762923351, + "grad_norm": 2.7044596672058105, + "learning_rate": 1.9879520941473804e-05, + "loss": 1.0588, + "step": 65 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 2.67295503616333, + "learning_rate": 1.9875109281794828e-05, + "loss": 1.1014, + "step": 66 + }, + { + "epoch": 0.11942959001782531, + "grad_norm": 2.5908753871917725, + "learning_rate": 1.9870618802957617e-05, + "loss": 1.1196, + "step": 67 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 2.461550712585449, + "learning_rate": 1.9866049540803364e-05, + "loss": 1.0968, + "step": 68 + }, + { + "epoch": 0.12299465240641712, + "grad_norm": 2.6429128646850586, + "learning_rate": 1.9861401531802093e-05, + "loss": 1.0237, + "step": 69 + }, + { + "epoch": 0.12477718360071301, + "grad_norm": 2.7659785747528076, + "learning_rate": 1.9856674813052345e-05, + "loss": 1.086, + "step": 70 + }, + { + "epoch": 0.1265597147950089, + "grad_norm": 2.5340003967285156, + "learning_rate": 1.9851869422280887e-05, + "loss": 1.0793, + "step": 71 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 2.8008666038513184, + "learning_rate": 1.9846985397842428e-05, + "loss": 1.1166, + "step": 72 + }, + { + "epoch": 0.13012477718360071, + "grad_norm": 3.1304714679718018, + "learning_rate": 1.9842022778719277e-05, + "loss": 1.0872, + "step": 73 + }, + { + "epoch": 0.1319073083778966, + "grad_norm": 2.276888608932495, + "learning_rate": 1.9836981604521077e-05, + "loss": 1.0454, + "step": 74 + }, + { + "epoch": 0.13368983957219252, + "grad_norm": 2.865384578704834, + "learning_rate": 1.9831861915484457e-05, + "loss": 1.1022, + "step": 75 + }, + { + "epoch": 0.1354723707664884, + "grad_norm": 2.901850938796997, + "learning_rate": 1.9826663752472716e-05, + "loss": 1.0293, + "step": 76 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 2.7473998069763184, + "learning_rate": 1.982138715697551e-05, + "loss": 1.1318, + "step": 77 + }, + { + "epoch": 0.13903743315508021, + "grad_norm": 2.841933488845825, + "learning_rate": 1.98160321711085e-05, + "loss": 1.0665, + "step": 78 + }, + { + "epoch": 0.1408199643493761, + "grad_norm": 2.8788764476776123, + "learning_rate": 1.9810598837613034e-05, + "loss": 1.1207, + "step": 79 + }, + { + "epoch": 0.14260249554367202, + "grad_norm": 2.6716983318328857, + "learning_rate": 1.9805087199855807e-05, + "loss": 1.1143, + "step": 80 + }, + { + "epoch": 0.1443850267379679, + "grad_norm": 2.925053834915161, + "learning_rate": 1.979949730182849e-05, + "loss": 1.1319, + "step": 81 + }, + { + "epoch": 0.14616755793226383, + "grad_norm": 2.625619411468506, + "learning_rate": 1.9793829188147406e-05, + "loss": 1.0684, + "step": 82 + }, + { + "epoch": 0.14795008912655971, + "grad_norm": 2.7794134616851807, + "learning_rate": 1.9788082904053168e-05, + "loss": 1.0967, + "step": 83 + }, + { + "epoch": 0.1497326203208556, + "grad_norm": 2.9480831623077393, + "learning_rate": 1.9782258495410306e-05, + "loss": 1.0883, + "step": 84 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 2.7370266914367676, + "learning_rate": 1.977635600870691e-05, + "loss": 1.1109, + "step": 85 + }, + { + "epoch": 0.1532976827094474, + "grad_norm": 2.5123989582061768, + "learning_rate": 1.9770375491054264e-05, + "loss": 1.0783, + "step": 86 + }, + { + "epoch": 0.15508021390374332, + "grad_norm": 2.7342569828033447, + "learning_rate": 1.976431699018646e-05, + "loss": 1.1205, + "step": 87 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 2.8322882652282715, + "learning_rate": 1.975818055446001e-05, + "loss": 1.0659, + "step": 88 + }, + { + "epoch": 0.1586452762923351, + "grad_norm": 2.488312244415283, + "learning_rate": 1.975196623285349e-05, + "loss": 1.1361, + "step": 89 + }, + { + "epoch": 0.16042780748663102, + "grad_norm": 3.0017895698547363, + "learning_rate": 1.974567407496712e-05, + "loss": 1.1512, + "step": 90 + }, + { + "epoch": 0.1622103386809269, + "grad_norm": 2.684353828430176, + "learning_rate": 1.9739304131022377e-05, + "loss": 1.0893, + "step": 91 + }, + { + "epoch": 0.16399286987522282, + "grad_norm": 2.4697201251983643, + "learning_rate": 1.9732856451861596e-05, + "loss": 0.986, + "step": 92 + }, + { + "epoch": 0.1657754010695187, + "grad_norm": 2.581225872039795, + "learning_rate": 1.9726331088947563e-05, + "loss": 1.0817, + "step": 93 + }, + { + "epoch": 0.16755793226381463, + "grad_norm": 2.7253096103668213, + "learning_rate": 1.9719728094363103e-05, + "loss": 1.0637, + "step": 94 + }, + { + "epoch": 0.16934046345811052, + "grad_norm": 2.6325371265411377, + "learning_rate": 1.9713047520810677e-05, + "loss": 1.0655, + "step": 95 + }, + { + "epoch": 0.1711229946524064, + "grad_norm": 3.3519203662872314, + "learning_rate": 1.970628942161193e-05, + "loss": 1.0285, + "step": 96 + }, + { + "epoch": 0.17290552584670232, + "grad_norm": 2.5676865577697754, + "learning_rate": 1.969945385070731e-05, + "loss": 1.1274, + "step": 97 + }, + { + "epoch": 0.1746880570409982, + "grad_norm": 2.7812039852142334, + "learning_rate": 1.9692540862655587e-05, + "loss": 1.1341, + "step": 98 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 2.6595091819763184, + "learning_rate": 1.9685550512633464e-05, + "loss": 1.1443, + "step": 99 + }, + { + "epoch": 0.17825311942959002, + "grad_norm": 2.484269142150879, + "learning_rate": 1.9678482856435107e-05, + "loss": 1.0319, + "step": 100 + }, + { + "epoch": 0.1800356506238859, + "grad_norm": 2.556206703186035, + "learning_rate": 1.9671337950471713e-05, + "loss": 1.0688, + "step": 101 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 2.646878480911255, + "learning_rate": 1.966411585177105e-05, + "loss": 1.1072, + "step": 102 + }, + { + "epoch": 0.1836007130124777, + "grad_norm": 2.8139848709106445, + "learning_rate": 1.9656816617977012e-05, + "loss": 1.0723, + "step": 103 + }, + { + "epoch": 0.18538324420677363, + "grad_norm": 2.6773009300231934, + "learning_rate": 1.9649440307349156e-05, + "loss": 1.1483, + "step": 104 + }, + { + "epoch": 0.18716577540106952, + "grad_norm": 2.9415299892425537, + "learning_rate": 1.9641986978762228e-05, + "loss": 1.1437, + "step": 105 + }, + { + "epoch": 0.1889483065953654, + "grad_norm": 2.647825241088867, + "learning_rate": 1.9634456691705705e-05, + "loss": 1.0377, + "step": 106 + }, + { + "epoch": 0.19073083778966132, + "grad_norm": 2.5940890312194824, + "learning_rate": 1.9626849506283316e-05, + "loss": 1.0183, + "step": 107 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 2.4696593284606934, + "learning_rate": 1.9619165483212565e-05, + "loss": 1.1108, + "step": 108 + }, + { + "epoch": 0.19429590017825313, + "grad_norm": 2.620328903198242, + "learning_rate": 1.9611404683824234e-05, + "loss": 1.0566, + "step": 109 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.8399813175201416, + "learning_rate": 1.9603567170061918e-05, + "loss": 1.0877, + "step": 110 + }, + { + "epoch": 0.19786096256684493, + "grad_norm": 2.864955425262451, + "learning_rate": 1.9595653004481493e-05, + "loss": 1.1072, + "step": 111 + }, + { + "epoch": 0.19964349376114082, + "grad_norm": 2.5178327560424805, + "learning_rate": 1.958766225025066e-05, + "loss": 1.052, + "step": 112 + }, + { + "epoch": 0.2014260249554367, + "grad_norm": 2.9980790615081787, + "learning_rate": 1.957959497114841e-05, + "loss": 1.1248, + "step": 113 + }, + { + "epoch": 0.20320855614973263, + "grad_norm": 2.472766637802124, + "learning_rate": 1.9571451231564523e-05, + "loss": 1.0617, + "step": 114 + }, + { + "epoch": 0.20499108734402852, + "grad_norm": 2.4862942695617676, + "learning_rate": 1.9563231096499066e-05, + "loss": 0.9894, + "step": 115 + }, + { + "epoch": 0.20677361853832443, + "grad_norm": 2.491788387298584, + "learning_rate": 1.955493463156185e-05, + "loss": 0.9946, + "step": 116 + }, + { + "epoch": 0.20855614973262032, + "grad_norm": 2.5384109020233154, + "learning_rate": 1.9546561902971935e-05, + "loss": 1.0207, + "step": 117 + }, + { + "epoch": 0.2103386809269162, + "grad_norm": 2.386254072189331, + "learning_rate": 1.9538112977557077e-05, + "loss": 1.0857, + "step": 118 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 2.649524688720703, + "learning_rate": 1.9529587922753205e-05, + "loss": 1.0267, + "step": 119 + }, + { + "epoch": 0.21390374331550802, + "grad_norm": 2.55436110496521, + "learning_rate": 1.9520986806603882e-05, + "loss": 1.1293, + "step": 120 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 2.5951426029205322, + "learning_rate": 1.951230969775977e-05, + "loss": 0.973, + "step": 121 + }, + { + "epoch": 0.21746880570409982, + "grad_norm": 3.4098243713378906, + "learning_rate": 1.9503556665478066e-05, + "loss": 1.1334, + "step": 122 + }, + { + "epoch": 0.2192513368983957, + "grad_norm": 2.6225473880767822, + "learning_rate": 1.949472777962196e-05, + "loss": 1.094, + "step": 123 + }, + { + "epoch": 0.22103386809269163, + "grad_norm": 2.8392271995544434, + "learning_rate": 1.948582311066008e-05, + "loss": 1.0163, + "step": 124 + }, + { + "epoch": 0.22281639928698752, + "grad_norm": 2.559697389602661, + "learning_rate": 1.9476842729665912e-05, + "loss": 1.0216, + "step": 125 + }, + { + "epoch": 0.22459893048128343, + "grad_norm": 2.266484498977661, + "learning_rate": 1.9467786708317257e-05, + "loss": 0.9825, + "step": 126 + }, + { + "epoch": 0.22638146167557932, + "grad_norm": 2.2157444953918457, + "learning_rate": 1.9458655118895634e-05, + "loss": 1.0281, + "step": 127 + }, + { + "epoch": 0.2281639928698752, + "grad_norm": 2.868452787399292, + "learning_rate": 1.9449448034285737e-05, + "loss": 1.148, + "step": 128 + }, + { + "epoch": 0.22994652406417113, + "grad_norm": 2.490839958190918, + "learning_rate": 1.9440165527974808e-05, + "loss": 1.0174, + "step": 129 + }, + { + "epoch": 0.23172905525846701, + "grad_norm": 2.3830602169036865, + "learning_rate": 1.9430807674052092e-05, + "loss": 1.0219, + "step": 130 + }, + { + "epoch": 0.23351158645276293, + "grad_norm": 2.284559488296509, + "learning_rate": 1.9421374547208223e-05, + "loss": 0.9775, + "step": 131 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 2.1950523853302, + "learning_rate": 1.941186622273463e-05, + "loss": 1.0513, + "step": 132 + }, + { + "epoch": 0.23707664884135474, + "grad_norm": 2.3674473762512207, + "learning_rate": 1.9402282776522943e-05, + "loss": 1.0465, + "step": 133 + }, + { + "epoch": 0.23885918003565063, + "grad_norm": 2.5275073051452637, + "learning_rate": 1.939262428506438e-05, + "loss": 1.0873, + "step": 134 + }, + { + "epoch": 0.24064171122994651, + "grad_norm": 2.838315486907959, + "learning_rate": 1.938289082544915e-05, + "loss": 1.0799, + "step": 135 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 2.5094432830810547, + "learning_rate": 1.937308247536582e-05, + "loss": 1.0696, + "step": 136 + }, + { + "epoch": 0.24420677361853832, + "grad_norm": 2.2962303161621094, + "learning_rate": 1.9363199313100693e-05, + "loss": 1.0703, + "step": 137 + }, + { + "epoch": 0.24598930481283424, + "grad_norm": 2.6103367805480957, + "learning_rate": 1.9353241417537216e-05, + "loss": 1.0648, + "step": 138 + }, + { + "epoch": 0.24777183600713013, + "grad_norm": 2.8506877422332764, + "learning_rate": 1.93432088681553e-05, + "loss": 1.1143, + "step": 139 + }, + { + "epoch": 0.24955436720142601, + "grad_norm": 2.5705273151397705, + "learning_rate": 1.9333101745030735e-05, + "loss": 1.1011, + "step": 140 + }, + { + "epoch": 0.25133689839572193, + "grad_norm": 2.8784642219543457, + "learning_rate": 1.9322920128834527e-05, + "loss": 0.9982, + "step": 141 + }, + { + "epoch": 0.25133689839572193, + "eval_loss": 1.0543876886367798, + "eval_runtime": 329.8866, + "eval_samples_per_second": 8.918, + "eval_steps_per_second": 1.116, + "step": 141 + }, + { + "epoch": 0.2531194295900178, + "grad_norm": 3.0359432697296143, + "learning_rate": 1.9312664100832236e-05, + "loss": 1.0954, + "step": 142 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 2.64568829536438, + "learning_rate": 1.930233374288337e-05, + "loss": 1.0921, + "step": 143 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 2.968031644821167, + "learning_rate": 1.929192913744069e-05, + "loss": 1.1665, + "step": 144 + }, + { + "epoch": 0.25846702317290554, + "grad_norm": 2.6442747116088867, + "learning_rate": 1.9281450367549593e-05, + "loss": 1.0702, + "step": 145 + }, + { + "epoch": 0.26024955436720143, + "grad_norm": 2.722597360610962, + "learning_rate": 1.9270897516847406e-05, + "loss": 1.0307, + "step": 146 + }, + { + "epoch": 0.2620320855614973, + "grad_norm": 2.6280815601348877, + "learning_rate": 1.9260270669562747e-05, + "loss": 1.0478, + "step": 147 + }, + { + "epoch": 0.2638146167557932, + "grad_norm": 2.531064033508301, + "learning_rate": 1.9249569910514846e-05, + "loss": 1.0316, + "step": 148 + }, + { + "epoch": 0.26559714795008915, + "grad_norm": 2.552273750305176, + "learning_rate": 1.9238795325112867e-05, + "loss": 1.0622, + "step": 149 + }, + { + "epoch": 0.26737967914438504, + "grad_norm": 2.358672618865967, + "learning_rate": 1.9227946999355226e-05, + "loss": 1.0164, + "step": 150 + }, + { + "epoch": 0.26916221033868093, + "grad_norm": 2.6471526622772217, + "learning_rate": 1.9217025019828907e-05, + "loss": 1.0108, + "step": 151 + }, + { + "epoch": 0.2709447415329768, + "grad_norm": 2.467954397201538, + "learning_rate": 1.920602947370876e-05, + "loss": 1.0493, + "step": 152 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 2.6386890411376953, + "learning_rate": 1.9194960448756824e-05, + "loss": 1.0464, + "step": 153 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 2.795897960662842, + "learning_rate": 1.9183818033321612e-05, + "loss": 1.0289, + "step": 154 + }, + { + "epoch": 0.27629233511586454, + "grad_norm": 2.4060723781585693, + "learning_rate": 1.9172602316337413e-05, + "loss": 1.0661, + "step": 155 + }, + { + "epoch": 0.27807486631016043, + "grad_norm": 2.434025526046753, + "learning_rate": 1.9161313387323574e-05, + "loss": 1.0468, + "step": 156 + }, + { + "epoch": 0.2798573975044563, + "grad_norm": 2.5258090496063232, + "learning_rate": 1.9149951336383798e-05, + "loss": 1.0225, + "step": 157 + }, + { + "epoch": 0.2816399286987522, + "grad_norm": 2.409041166305542, + "learning_rate": 1.9138516254205416e-05, + "loss": 0.9958, + "step": 158 + }, + { + "epoch": 0.28342245989304815, + "grad_norm": 2.4306955337524414, + "learning_rate": 1.912700823205866e-05, + "loss": 1.0155, + "step": 159 + }, + { + "epoch": 0.28520499108734404, + "grad_norm": 2.49703311920166, + "learning_rate": 1.911542736179594e-05, + "loss": 1.0727, + "step": 160 + }, + { + "epoch": 0.28698752228163993, + "grad_norm": 2.567338228225708, + "learning_rate": 1.910377373585113e-05, + "loss": 1.0044, + "step": 161 + }, + { + "epoch": 0.2887700534759358, + "grad_norm": 2.35740327835083, + "learning_rate": 1.9092047447238775e-05, + "loss": 1.0163, + "step": 162 + }, + { + "epoch": 0.2905525846702317, + "grad_norm": 2.319448709487915, + "learning_rate": 1.908024858955341e-05, + "loss": 0.9863, + "step": 163 + }, + { + "epoch": 0.29233511586452765, + "grad_norm": 2.6530685424804688, + "learning_rate": 1.9068377256968782e-05, + "loss": 1.0428, + "step": 164 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 2.4042210578918457, + "learning_rate": 1.9056433544237106e-05, + "loss": 1.026, + "step": 165 + }, + { + "epoch": 0.29590017825311943, + "grad_norm": 2.575970411300659, + "learning_rate": 1.9044417546688295e-05, + "loss": 1.0751, + "step": 166 + }, + { + "epoch": 0.2976827094474153, + "grad_norm": 2.477625608444214, + "learning_rate": 1.9032329360229225e-05, + "loss": 1.0335, + "step": 167 + }, + { + "epoch": 0.2994652406417112, + "grad_norm": 2.2049059867858887, + "learning_rate": 1.9020169081342942e-05, + "loss": 0.9878, + "step": 168 + }, + { + "epoch": 0.30124777183600715, + "grad_norm": 2.736179828643799, + "learning_rate": 1.900793680708792e-05, + "loss": 1.0221, + "step": 169 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 2.3086302280426025, + "learning_rate": 1.899563263509725e-05, + "loss": 1.023, + "step": 170 + }, + { + "epoch": 0.3048128342245989, + "grad_norm": 2.41434645652771, + "learning_rate": 1.8983256663577898e-05, + "loss": 1.0016, + "step": 171 + }, + { + "epoch": 0.3065953654188948, + "grad_norm": 2.048405647277832, + "learning_rate": 1.8970808991309905e-05, + "loss": 0.9322, + "step": 172 + }, + { + "epoch": 0.3083778966131907, + "grad_norm": 2.6779351234436035, + "learning_rate": 1.895828971764559e-05, + "loss": 1.06, + "step": 173 + }, + { + "epoch": 0.31016042780748665, + "grad_norm": 2.5748863220214844, + "learning_rate": 1.894569894250877e-05, + "loss": 1.0392, + "step": 174 + }, + { + "epoch": 0.31194295900178254, + "grad_norm": 2.1274595260620117, + "learning_rate": 1.8933036766393962e-05, + "loss": 1.0284, + "step": 175 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 2.525455951690674, + "learning_rate": 1.8920303290365568e-05, + "loss": 1.0964, + "step": 176 + }, + { + "epoch": 0.3155080213903743, + "grad_norm": 2.5733320713043213, + "learning_rate": 1.8907498616057084e-05, + "loss": 1.032, + "step": 177 + }, + { + "epoch": 0.3172905525846702, + "grad_norm": 2.2275173664093018, + "learning_rate": 1.8894622845670282e-05, + "loss": 1.0473, + "step": 178 + }, + { + "epoch": 0.31907308377896615, + "grad_norm": 2.2616934776306152, + "learning_rate": 1.888167608197439e-05, + "loss": 1.0048, + "step": 179 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 2.0974690914154053, + "learning_rate": 1.886865842830528e-05, + "loss": 1.0566, + "step": 180 + }, + { + "epoch": 0.3226381461675579, + "grad_norm": 2.5724925994873047, + "learning_rate": 1.8855569988564636e-05, + "loss": 1.0125, + "step": 181 + }, + { + "epoch": 0.3244206773618538, + "grad_norm": 2.321138620376587, + "learning_rate": 1.8842410867219137e-05, + "loss": 1.0347, + "step": 182 + }, + { + "epoch": 0.32620320855614976, + "grad_norm": 2.638758420944214, + "learning_rate": 1.8829181169299596e-05, + "loss": 1.095, + "step": 183 + }, + { + "epoch": 0.32798573975044565, + "grad_norm": 2.51639986038208, + "learning_rate": 1.8815881000400164e-05, + "loss": 1.0083, + "step": 184 + }, + { + "epoch": 0.32976827094474154, + "grad_norm": 2.4949166774749756, + "learning_rate": 1.880251046667744e-05, + "loss": 1.0022, + "step": 185 + }, + { + "epoch": 0.3315508021390374, + "grad_norm": 2.2100119590759277, + "learning_rate": 1.878906967484966e-05, + "loss": 1.0869, + "step": 186 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.4192895889282227, + "learning_rate": 1.877555873219583e-05, + "loss": 0.9759, + "step": 187 + }, + { + "epoch": 0.33511586452762926, + "grad_norm": 2.273297071456909, + "learning_rate": 1.876197774655487e-05, + "loss": 1.0539, + "step": 188 + }, + { + "epoch": 0.33689839572192515, + "grad_norm": 2.259113311767578, + "learning_rate": 1.874832682632476e-05, + "loss": 0.9344, + "step": 189 + }, + { + "epoch": 0.33868092691622104, + "grad_norm": 2.2211596965789795, + "learning_rate": 1.8734606080461657e-05, + "loss": 1.0609, + "step": 190 + }, + { + "epoch": 0.3404634581105169, + "grad_norm": 2.131674289703369, + "learning_rate": 1.8720815618479053e-05, + "loss": 0.999, + "step": 191 + }, + { + "epoch": 0.3422459893048128, + "grad_norm": 2.7799980640411377, + "learning_rate": 1.870695555044688e-05, + "loss": 1.1119, + "step": 192 + }, + { + "epoch": 0.34402852049910876, + "grad_norm": 2.1310036182403564, + "learning_rate": 1.869302598699063e-05, + "loss": 1.0332, + "step": 193 + }, + { + "epoch": 0.34581105169340465, + "grad_norm": 2.501399040222168, + "learning_rate": 1.86790270392905e-05, + "loss": 0.971, + "step": 194 + }, + { + "epoch": 0.34759358288770054, + "grad_norm": 2.521914482116699, + "learning_rate": 1.866495881908046e-05, + "loss": 0.9663, + "step": 195 + }, + { + "epoch": 0.3493761140819964, + "grad_norm": 2.6178207397460938, + "learning_rate": 1.86508214386474e-05, + "loss": 1.0189, + "step": 196 + }, + { + "epoch": 0.3511586452762923, + "grad_norm": 2.4940268993377686, + "learning_rate": 1.8636615010830216e-05, + "loss": 0.9989, + "step": 197 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 2.109654188156128, + "learning_rate": 1.8622339649018907e-05, + "loss": 1.0024, + "step": 198 + }, + { + "epoch": 0.35472370766488415, + "grad_norm": 2.862342596054077, + "learning_rate": 1.8607995467153692e-05, + "loss": 1.0465, + "step": 199 + }, + { + "epoch": 0.35650623885918004, + "grad_norm": 2.5327987670898438, + "learning_rate": 1.8593582579724062e-05, + "loss": 0.9383, + "step": 200 + }, + { + "epoch": 0.3582887700534759, + "grad_norm": 2.031452178955078, + "learning_rate": 1.8579101101767904e-05, + "loss": 1.0371, + "step": 201 + }, + { + "epoch": 0.3600713012477718, + "grad_norm": 2.711760997772217, + "learning_rate": 1.856455114887056e-05, + "loss": 1.0148, + "step": 202 + }, + { + "epoch": 0.36185383244206776, + "grad_norm": 2.1266770362854004, + "learning_rate": 1.8549932837163917e-05, + "loss": 0.9999, + "step": 203 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 2.262907028198242, + "learning_rate": 1.8535246283325468e-05, + "loss": 0.9768, + "step": 204 + }, + { + "epoch": 0.36541889483065954, + "grad_norm": 2.307704210281372, + "learning_rate": 1.8520491604577388e-05, + "loss": 1.0673, + "step": 205 + }, + { + "epoch": 0.3672014260249554, + "grad_norm": 2.203144073486328, + "learning_rate": 1.8505668918685603e-05, + "loss": 1.0154, + "step": 206 + }, + { + "epoch": 0.3689839572192513, + "grad_norm": 2.0945725440979004, + "learning_rate": 1.849077834395884e-05, + "loss": 1.0539, + "step": 207 + }, + { + "epoch": 0.37076648841354726, + "grad_norm": 2.8280982971191406, + "learning_rate": 1.8475819999247694e-05, + "loss": 1.0071, + "step": 208 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 2.0130624771118164, + "learning_rate": 1.8460794003943658e-05, + "loss": 0.9843, + "step": 209 + }, + { + "epoch": 0.37433155080213903, + "grad_norm": 2.186617612838745, + "learning_rate": 1.8445700477978207e-05, + "loss": 1.0136, + "step": 210 + }, + { + "epoch": 0.3761140819964349, + "grad_norm": 2.1577677726745605, + "learning_rate": 1.8430539541821795e-05, + "loss": 1.0273, + "step": 211 + }, + { + "epoch": 0.3778966131907308, + "grad_norm": 2.9277737140655518, + "learning_rate": 1.8415311316482937e-05, + "loss": 0.9436, + "step": 212 + }, + { + "epoch": 0.37967914438502676, + "grad_norm": 2.1575350761413574, + "learning_rate": 1.840001592350721e-05, + "loss": 1.0307, + "step": 213 + }, + { + "epoch": 0.38146167557932265, + "grad_norm": 2.3168020248413086, + "learning_rate": 1.8384653484976305e-05, + "loss": 0.9642, + "step": 214 + }, + { + "epoch": 0.38324420677361853, + "grad_norm": 2.5893986225128174, + "learning_rate": 1.8369224123507035e-05, + "loss": 1.0545, + "step": 215 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 2.237194538116455, + "learning_rate": 1.835372796225037e-05, + "loss": 1.0229, + "step": 216 + }, + { + "epoch": 0.3868092691622103, + "grad_norm": 2.373006582260132, + "learning_rate": 1.8338165124890455e-05, + "loss": 1.0323, + "step": 217 + }, + { + "epoch": 0.38859180035650626, + "grad_norm": 2.0743327140808105, + "learning_rate": 1.8322535735643604e-05, + "loss": 1.0576, + "step": 218 + }, + { + "epoch": 0.39037433155080214, + "grad_norm": 2.5559821128845215, + "learning_rate": 1.830683991925733e-05, + "loss": 1.0307, + "step": 219 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.2722411155700684, + "learning_rate": 1.829107780100934e-05, + "loss": 0.9956, + "step": 220 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 2.3079795837402344, + "learning_rate": 1.8275249506706535e-05, + "loss": 1.0553, + "step": 221 + }, + { + "epoch": 0.39572192513368987, + "grad_norm": 2.027052402496338, + "learning_rate": 1.8259355162684e-05, + "loss": 1.0089, + "step": 222 + }, + { + "epoch": 0.39750445632798576, + "grad_norm": 2.2788703441619873, + "learning_rate": 1.8243394895804012e-05, + "loss": 0.9516, + "step": 223 + }, + { + "epoch": 0.39928698752228164, + "grad_norm": 2.1616568565368652, + "learning_rate": 1.8227368833455023e-05, + "loss": 0.9019, + "step": 224 + }, + { + "epoch": 0.40106951871657753, + "grad_norm": 2.5529351234436035, + "learning_rate": 1.821127710355062e-05, + "loss": 1.0444, + "step": 225 + }, + { + "epoch": 0.4028520499108734, + "grad_norm": 2.000537872314453, + "learning_rate": 1.8195119834528535e-05, + "loss": 0.9659, + "step": 226 + }, + { + "epoch": 0.40463458110516937, + "grad_norm": 2.050112724304199, + "learning_rate": 1.8178897155349598e-05, + "loss": 1.0564, + "step": 227 + }, + { + "epoch": 0.40641711229946526, + "grad_norm": 1.993181586265564, + "learning_rate": 1.816260919549673e-05, + "loss": 0.9761, + "step": 228 + }, + { + "epoch": 0.40819964349376114, + "grad_norm": 2.506268262863159, + "learning_rate": 1.814625608497389e-05, + "loss": 1.0261, + "step": 229 + }, + { + "epoch": 0.40998217468805703, + "grad_norm": 2.155731439590454, + "learning_rate": 1.8129837954305033e-05, + "loss": 1.0477, + "step": 230 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 2.631788969039917, + "learning_rate": 1.8113354934533086e-05, + "loss": 1.057, + "step": 231 + }, + { + "epoch": 0.41354723707664887, + "grad_norm": 2.3771722316741943, + "learning_rate": 1.809680715721891e-05, + "loss": 1.0031, + "step": 232 + }, + { + "epoch": 0.41532976827094475, + "grad_norm": 2.1514644622802734, + "learning_rate": 1.8080194754440205e-05, + "loss": 1.0661, + "step": 233 + }, + { + "epoch": 0.41711229946524064, + "grad_norm": 2.065730333328247, + "learning_rate": 1.8063517858790517e-05, + "loss": 0.963, + "step": 234 + }, + { + "epoch": 0.41889483065953653, + "grad_norm": 2.359872341156006, + "learning_rate": 1.804677660337812e-05, + "loss": 1.0196, + "step": 235 + }, + { + "epoch": 0.4206773618538324, + "grad_norm": 2.3731632232666016, + "learning_rate": 1.8029971121824997e-05, + "loss": 0.9878, + "step": 236 + }, + { + "epoch": 0.42245989304812837, + "grad_norm": 2.6104416847229004, + "learning_rate": 1.801310154826576e-05, + "loss": 1.0736, + "step": 237 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 2.411689043045044, + "learning_rate": 1.799616801734657e-05, + "loss": 1.0152, + "step": 238 + }, + { + "epoch": 0.42602495543672014, + "grad_norm": 2.3272037506103516, + "learning_rate": 1.7979170664224078e-05, + "loss": 1.0123, + "step": 239 + }, + { + "epoch": 0.42780748663101603, + "grad_norm": 2.3518447875976562, + "learning_rate": 1.7962109624564324e-05, + "loss": 1.0213, + "step": 240 + }, + { + "epoch": 0.4295900178253119, + "grad_norm": 2.7915849685668945, + "learning_rate": 1.794498503454169e-05, + "loss": 1.1125, + "step": 241 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 2.335583448410034, + "learning_rate": 1.792779703083777e-05, + "loss": 0.935, + "step": 242 + }, + { + "epoch": 0.43315508021390375, + "grad_norm": 2.5337862968444824, + "learning_rate": 1.7910545750640317e-05, + "loss": 1.1122, + "step": 243 + }, + { + "epoch": 0.43493761140819964, + "grad_norm": 2.29020357131958, + "learning_rate": 1.7893231331642118e-05, + "loss": 0.9865, + "step": 244 + }, + { + "epoch": 0.43672014260249553, + "grad_norm": 2.0491623878479004, + "learning_rate": 1.7875853912039915e-05, + "loss": 0.9985, + "step": 245 + }, + { + "epoch": 0.4385026737967914, + "grad_norm": 2.5333423614501953, + "learning_rate": 1.7858413630533305e-05, + "loss": 1.0376, + "step": 246 + }, + { + "epoch": 0.44028520499108736, + "grad_norm": 2.1056642532348633, + "learning_rate": 1.7840910626323603e-05, + "loss": 0.9189, + "step": 247 + }, + { + "epoch": 0.44206773618538325, + "grad_norm": 2.0595109462738037, + "learning_rate": 1.7823345039112772e-05, + "loss": 1.0123, + "step": 248 + }, + { + "epoch": 0.44385026737967914, + "grad_norm": 2.0415713787078857, + "learning_rate": 1.780571700910227e-05, + "loss": 0.9969, + "step": 249 + }, + { + "epoch": 0.44563279857397503, + "grad_norm": 2.055589437484741, + "learning_rate": 1.778802667699196e-05, + "loss": 1.0098, + "step": 250 + }, + { + "epoch": 0.4474153297682709, + "grad_norm": 1.972589373588562, + "learning_rate": 1.7770274183978975e-05, + "loss": 0.9739, + "step": 251 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 1.9704911708831787, + "learning_rate": 1.775245967175658e-05, + "loss": 0.9221, + "step": 252 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 2.3809704780578613, + "learning_rate": 1.773458328251307e-05, + "loss": 1.0125, + "step": 253 + }, + { + "epoch": 0.45276292335115864, + "grad_norm": 2.554600715637207, + "learning_rate": 1.77166451589306e-05, + "loss": 1.1072, + "step": 254 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 2.169062614440918, + "learning_rate": 1.7698645444184074e-05, + "loss": 1.0075, + "step": 255 + }, + { + "epoch": 0.4563279857397504, + "grad_norm": 2.2745189666748047, + "learning_rate": 1.768058428193999e-05, + "loss": 1.0474, + "step": 256 + }, + { + "epoch": 0.45811051693404636, + "grad_norm": 2.583153247833252, + "learning_rate": 1.76624618163553e-05, + "loss": 1.0531, + "step": 257 + }, + { + "epoch": 0.45989304812834225, + "grad_norm": 2.389552116394043, + "learning_rate": 1.764427819207624e-05, + "loss": 0.9571, + "step": 258 + }, + { + "epoch": 0.46167557932263814, + "grad_norm": 2.1886773109436035, + "learning_rate": 1.762603355423721e-05, + "loss": 0.9814, + "step": 259 + }, + { + "epoch": 0.46345811051693403, + "grad_norm": 2.6514391899108887, + "learning_rate": 1.7607728048459572e-05, + "loss": 1.1205, + "step": 260 + }, + { + "epoch": 0.46524064171123, + "grad_norm": 2.1365182399749756, + "learning_rate": 1.758936182085054e-05, + "loss": 0.9833, + "step": 261 + }, + { + "epoch": 0.46702317290552586, + "grad_norm": 2.6089439392089844, + "learning_rate": 1.757093501800196e-05, + "loss": 1.0578, + "step": 262 + }, + { + "epoch": 0.46880570409982175, + "grad_norm": 2.455092668533325, + "learning_rate": 1.755244778698918e-05, + "loss": 1.0678, + "step": 263 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 2.013976573944092, + "learning_rate": 1.7533900275369857e-05, + "loss": 1.0032, + "step": 264 + }, + { + "epoch": 0.47237076648841353, + "grad_norm": 2.275420904159546, + "learning_rate": 1.7515292631182782e-05, + "loss": 0.973, + "step": 265 + }, + { + "epoch": 0.4741532976827095, + "grad_norm": 2.0543277263641357, + "learning_rate": 1.7496625002946702e-05, + "loss": 1.031, + "step": 266 + }, + { + "epoch": 0.47593582887700536, + "grad_norm": 2.4391744136810303, + "learning_rate": 1.747789753965913e-05, + "loss": 1.0207, + "step": 267 + }, + { + "epoch": 0.47771836007130125, + "grad_norm": 2.3594958782196045, + "learning_rate": 1.745911039079516e-05, + "loss": 1.0113, + "step": 268 + }, + { + "epoch": 0.47950089126559714, + "grad_norm": 2.252363681793213, + "learning_rate": 1.744026370630628e-05, + "loss": 0.9894, + "step": 269 + }, + { + "epoch": 0.48128342245989303, + "grad_norm": 2.4105823040008545, + "learning_rate": 1.7421357636619153e-05, + "loss": 1.0012, + "step": 270 + }, + { + "epoch": 0.483065953654189, + "grad_norm": 2.206552028656006, + "learning_rate": 1.7402392332634442e-05, + "loss": 1.0078, + "step": 271 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 2.089542865753174, + "learning_rate": 1.7383367945725584e-05, + "loss": 1.0526, + "step": 272 + }, + { + "epoch": 0.48663101604278075, + "grad_norm": 2.173814058303833, + "learning_rate": 1.7364284627737603e-05, + "loss": 0.9653, + "step": 273 + }, + { + "epoch": 0.48841354723707664, + "grad_norm": 2.486128568649292, + "learning_rate": 1.734514253098589e-05, + "loss": 1.0228, + "step": 274 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 2.1726722717285156, + "learning_rate": 1.732594180825496e-05, + "loss": 0.9509, + "step": 275 + }, + { + "epoch": 0.4919786096256685, + "grad_norm": 1.9146912097930908, + "learning_rate": 1.730668261279729e-05, + "loss": 0.9773, + "step": 276 + }, + { + "epoch": 0.49376114081996436, + "grad_norm": 2.7147133350372314, + "learning_rate": 1.7287365098332042e-05, + "loss": 1.0312, + "step": 277 + }, + { + "epoch": 0.49554367201426025, + "grad_norm": 2.2072091102600098, + "learning_rate": 1.726798941904386e-05, + "loss": 1.0854, + "step": 278 + }, + { + "epoch": 0.49732620320855614, + "grad_norm": 2.2345147132873535, + "learning_rate": 1.724855572958164e-05, + "loss": 0.9868, + "step": 279 + }, + { + "epoch": 0.49910873440285203, + "grad_norm": 2.1517088413238525, + "learning_rate": 1.722906418505729e-05, + "loss": 0.9413, + "step": 280 + }, + { + "epoch": 0.5008912655971479, + "grad_norm": 2.3915605545043945, + "learning_rate": 1.7209514941044494e-05, + "loss": 0.9697, + "step": 281 + }, + { + "epoch": 0.5026737967914439, + "grad_norm": 2.306183099746704, + "learning_rate": 1.7189908153577473e-05, + "loss": 1.0336, + "step": 282 + }, + { + "epoch": 0.5026737967914439, + "eval_loss": 1.0013747215270996, + "eval_runtime": 329.9461, + "eval_samples_per_second": 8.917, + "eval_steps_per_second": 1.115, + "step": 282 + }, + { + "epoch": 0.5044563279857398, + "grad_norm": 2.168407917022705, + "learning_rate": 1.717024397914973e-05, + "loss": 0.932, + "step": 283 + }, + { + "epoch": 0.5062388591800356, + "grad_norm": 2.4041762351989746, + "learning_rate": 1.7150522574712815e-05, + "loss": 1.0129, + "step": 284 + }, + { + "epoch": 0.5080213903743316, + "grad_norm": 2.6760098934173584, + "learning_rate": 1.7130744097675058e-05, + "loss": 1.0125, + "step": 285 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 2.7730419635772705, + "learning_rate": 1.7110908705900322e-05, + "loss": 1.021, + "step": 286 + }, + { + "epoch": 0.5115864527629234, + "grad_norm": 2.342268943786621, + "learning_rate": 1.7091016557706747e-05, + "loss": 1.0381, + "step": 287 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 2.5511789321899414, + "learning_rate": 1.7071067811865477e-05, + "loss": 1.0157, + "step": 288 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 2.5863358974456787, + "learning_rate": 1.7051062627599385e-05, + "loss": 1.0438, + "step": 289 + }, + { + "epoch": 0.5169340463458111, + "grad_norm": 2.2332570552825928, + "learning_rate": 1.7031001164581828e-05, + "loss": 1.0236, + "step": 290 + }, + { + "epoch": 0.5187165775401069, + "grad_norm": 1.9721466302871704, + "learning_rate": 1.701088358293535e-05, + "loss": 0.9639, + "step": 291 + }, + { + "epoch": 0.5204991087344029, + "grad_norm": 2.1957123279571533, + "learning_rate": 1.6990710043230407e-05, + "loss": 0.9288, + "step": 292 + }, + { + "epoch": 0.5222816399286988, + "grad_norm": 2.659257411956787, + "learning_rate": 1.69704807064841e-05, + "loss": 1.1005, + "step": 293 + }, + { + "epoch": 0.5240641711229946, + "grad_norm": 2.452608346939087, + "learning_rate": 1.6950195734158874e-05, + "loss": 0.9854, + "step": 294 + }, + { + "epoch": 0.5258467023172906, + "grad_norm": 2.114299774169922, + "learning_rate": 1.6929855288161234e-05, + "loss": 0.9477, + "step": 295 + }, + { + "epoch": 0.5276292335115864, + "grad_norm": 2.170616865158081, + "learning_rate": 1.6909459530840457e-05, + "loss": 1.0347, + "step": 296 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 2.016688823699951, + "learning_rate": 1.6889008624987284e-05, + "loss": 0.9756, + "step": 297 + }, + { + "epoch": 0.5311942959001783, + "grad_norm": 2.1537399291992188, + "learning_rate": 1.6868502733832647e-05, + "loss": 1.104, + "step": 298 + }, + { + "epoch": 0.5329768270944741, + "grad_norm": 2.0217056274414062, + "learning_rate": 1.684794202104633e-05, + "loss": 0.9949, + "step": 299 + }, + { + "epoch": 0.5347593582887701, + "grad_norm": 1.9496427774429321, + "learning_rate": 1.682732665073569e-05, + "loss": 1.0041, + "step": 300 + }, + { + "epoch": 0.5365418894830659, + "grad_norm": 2.2652711868286133, + "learning_rate": 1.6806656787444338e-05, + "loss": 0.985, + "step": 301 + }, + { + "epoch": 0.5383244206773619, + "grad_norm": 2.223661184310913, + "learning_rate": 1.6785932596150827e-05, + "loss": 0.9828, + "step": 302 + }, + { + "epoch": 0.5401069518716578, + "grad_norm": 2.382096767425537, + "learning_rate": 1.6765154242267328e-05, + "loss": 1.0278, + "step": 303 + }, + { + "epoch": 0.5418894830659536, + "grad_norm": 2.034925699234009, + "learning_rate": 1.6744321891638328e-05, + "loss": 0.9006, + "step": 304 + }, + { + "epoch": 0.5436720142602496, + "grad_norm": 2.47273588180542, + "learning_rate": 1.6723435710539286e-05, + "loss": 1.0393, + "step": 305 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 2.252715587615967, + "learning_rate": 1.670249586567531e-05, + "loss": 0.9549, + "step": 306 + }, + { + "epoch": 0.5472370766488414, + "grad_norm": 2.026567220687866, + "learning_rate": 1.668150252417984e-05, + "loss": 1.0656, + "step": 307 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 2.188436508178711, + "learning_rate": 1.6660455853613303e-05, + "loss": 0.9633, + "step": 308 + }, + { + "epoch": 0.5508021390374331, + "grad_norm": 2.008063316345215, + "learning_rate": 1.6639356021961767e-05, + "loss": 0.9901, + "step": 309 + }, + { + "epoch": 0.5525846702317291, + "grad_norm": 2.1796090602874756, + "learning_rate": 1.6618203197635624e-05, + "loss": 0.983, + "step": 310 + }, + { + "epoch": 0.5543672014260249, + "grad_norm": 2.1821656227111816, + "learning_rate": 1.659699754946823e-05, + "loss": 0.9757, + "step": 311 + }, + { + "epoch": 0.5561497326203209, + "grad_norm": 2.1550426483154297, + "learning_rate": 1.657573924671455e-05, + "loss": 1.0422, + "step": 312 + }, + { + "epoch": 0.5579322638146168, + "grad_norm": 2.2479467391967773, + "learning_rate": 1.6554428459049826e-05, + "loss": 0.9551, + "step": 313 + }, + { + "epoch": 0.5597147950089126, + "grad_norm": 2.107180595397949, + "learning_rate": 1.6533065356568206e-05, + "loss": 1.0978, + "step": 314 + }, + { + "epoch": 0.5614973262032086, + "grad_norm": 2.159715175628662, + "learning_rate": 1.651165010978141e-05, + "loss": 0.9501, + "step": 315 + }, + { + "epoch": 0.5632798573975044, + "grad_norm": 2.454437494277954, + "learning_rate": 1.6490182889617326e-05, + "loss": 0.9584, + "step": 316 + }, + { + "epoch": 0.5650623885918004, + "grad_norm": 2.113189458847046, + "learning_rate": 1.6468663867418705e-05, + "loss": 0.9517, + "step": 317 + }, + { + "epoch": 0.5668449197860963, + "grad_norm": 2.0066540241241455, + "learning_rate": 1.6447093214941727e-05, + "loss": 0.9769, + "step": 318 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 2.08366322517395, + "learning_rate": 1.6425471104354694e-05, + "loss": 0.9421, + "step": 319 + }, + { + "epoch": 0.5704099821746881, + "grad_norm": 1.937745213508606, + "learning_rate": 1.6403797708236603e-05, + "loss": 0.9922, + "step": 320 + }, + { + "epoch": 0.5721925133689839, + "grad_norm": 1.8456331491470337, + "learning_rate": 1.6382073199575816e-05, + "loss": 0.9445, + "step": 321 + }, + { + "epoch": 0.5739750445632799, + "grad_norm": 2.1327157020568848, + "learning_rate": 1.636029775176862e-05, + "loss": 1.0098, + "step": 322 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 2.1541051864624023, + "learning_rate": 1.6338471538617918e-05, + "loss": 1.0162, + "step": 323 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 1.9894920587539673, + "learning_rate": 1.631659473433177e-05, + "loss": 1.0404, + "step": 324 + }, + { + "epoch": 0.5793226381461676, + "grad_norm": 2.2389776706695557, + "learning_rate": 1.6294667513522054e-05, + "loss": 1.0277, + "step": 325 + }, + { + "epoch": 0.5811051693404634, + "grad_norm": 2.7732322216033936, + "learning_rate": 1.627269005120304e-05, + "loss": 0.9646, + "step": 326 + }, + { + "epoch": 0.5828877005347594, + "grad_norm": 2.0119757652282715, + "learning_rate": 1.625066252279001e-05, + "loss": 1.0093, + "step": 327 + }, + { + "epoch": 0.5846702317290553, + "grad_norm": 1.7421563863754272, + "learning_rate": 1.622858510409785e-05, + "loss": 0.9558, + "step": 328 + }, + { + "epoch": 0.5864527629233511, + "grad_norm": 2.3108606338500977, + "learning_rate": 1.620645797133966e-05, + "loss": 0.9656, + "step": 329 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.1272060871124268, + "learning_rate": 1.618428130112533e-05, + "loss": 1.0287, + "step": 330 + }, + { + "epoch": 0.5900178253119429, + "grad_norm": 2.0039730072021484, + "learning_rate": 1.616205527046014e-05, + "loss": 0.9728, + "step": 331 + }, + { + "epoch": 0.5918003565062389, + "grad_norm": 2.106395721435547, + "learning_rate": 1.6139780056743343e-05, + "loss": 1.0319, + "step": 332 + }, + { + "epoch": 0.5935828877005348, + "grad_norm": 1.868335247039795, + "learning_rate": 1.6117455837766752e-05, + "loss": 0.9216, + "step": 333 + }, + { + "epoch": 0.5953654188948306, + "grad_norm": 2.4309539794921875, + "learning_rate": 1.6095082791713322e-05, + "loss": 1.0407, + "step": 334 + }, + { + "epoch": 0.5971479500891266, + "grad_norm": 2.3109192848205566, + "learning_rate": 1.6072661097155732e-05, + "loss": 1.0216, + "step": 335 + }, + { + "epoch": 0.5989304812834224, + "grad_norm": 1.9943416118621826, + "learning_rate": 1.6050190933054937e-05, + "loss": 0.9701, + "step": 336 + }, + { + "epoch": 0.6007130124777184, + "grad_norm": 2.3711092472076416, + "learning_rate": 1.6027672478758776e-05, + "loss": 0.98, + "step": 337 + }, + { + "epoch": 0.6024955436720143, + "grad_norm": 2.296501874923706, + "learning_rate": 1.6005105914000508e-05, + "loss": 0.9921, + "step": 338 + }, + { + "epoch": 0.6042780748663101, + "grad_norm": 2.1762170791625977, + "learning_rate": 1.5982491418897393e-05, + "loss": 0.9573, + "step": 339 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 2.417280673980713, + "learning_rate": 1.5959829173949256e-05, + "loss": 1.0293, + "step": 340 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 1.763693928718567, + "learning_rate": 1.5937119360037035e-05, + "loss": 0.9643, + "step": 341 + }, + { + "epoch": 0.6096256684491979, + "grad_norm": 2.0555365085601807, + "learning_rate": 1.5914362158421352e-05, + "loss": 1.0429, + "step": 342 + }, + { + "epoch": 0.6114081996434938, + "grad_norm": 2.018656015396118, + "learning_rate": 1.5891557750741054e-05, + "loss": 1.0253, + "step": 343 + }, + { + "epoch": 0.6131907308377896, + "grad_norm": 2.1717021465301514, + "learning_rate": 1.586870631901177e-05, + "loss": 0.9776, + "step": 344 + }, + { + "epoch": 0.6149732620320856, + "grad_norm": 2.066257953643799, + "learning_rate": 1.5845808045624456e-05, + "loss": 0.9581, + "step": 345 + }, + { + "epoch": 0.6167557932263814, + "grad_norm": 1.9698491096496582, + "learning_rate": 1.5822863113343934e-05, + "loss": 0.8828, + "step": 346 + }, + { + "epoch": 0.6185383244206774, + "grad_norm": 2.027034282684326, + "learning_rate": 1.5799871705307447e-05, + "loss": 1.041, + "step": 347 + }, + { + "epoch": 0.6203208556149733, + "grad_norm": 2.147871255874634, + "learning_rate": 1.5776834005023184e-05, + "loss": 1.0008, + "step": 348 + }, + { + "epoch": 0.6221033868092691, + "grad_norm": 2.168610095977783, + "learning_rate": 1.5753750196368822e-05, + "loss": 0.9446, + "step": 349 + }, + { + "epoch": 0.6238859180035651, + "grad_norm": 1.9053958654403687, + "learning_rate": 1.5730620463590052e-05, + "loss": 0.9966, + "step": 350 + }, + { + "epoch": 0.6256684491978609, + "grad_norm": 2.295462131500244, + "learning_rate": 1.5707444991299116e-05, + "loss": 0.9764, + "step": 351 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 1.9814562797546387, + "learning_rate": 1.5684223964473338e-05, + "loss": 0.9882, + "step": 352 + }, + { + "epoch": 0.6292335115864528, + "grad_norm": 2.3943731784820557, + "learning_rate": 1.566095756845362e-05, + "loss": 1.0871, + "step": 353 + }, + { + "epoch": 0.6310160427807486, + "grad_norm": 1.8611518144607544, + "learning_rate": 1.5637645988943008e-05, + "loss": 1.0166, + "step": 354 + }, + { + "epoch": 0.6327985739750446, + "grad_norm": 2.168264389038086, + "learning_rate": 1.5614289412005164e-05, + "loss": 0.9882, + "step": 355 + }, + { + "epoch": 0.6345811051693404, + "grad_norm": 1.9876874685287476, + "learning_rate": 1.559088802406292e-05, + "loss": 0.9658, + "step": 356 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 2.154973268508911, + "learning_rate": 1.5567442011896748e-05, + "loss": 0.9549, + "step": 357 + }, + { + "epoch": 0.6381461675579323, + "grad_norm": 1.892314076423645, + "learning_rate": 1.554395156264331e-05, + "loss": 0.9583, + "step": 358 + }, + { + "epoch": 0.6399286987522281, + "grad_norm": 2.019479274749756, + "learning_rate": 1.5520416863793942e-05, + "loss": 1.0063, + "step": 359 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 2.3134050369262695, + "learning_rate": 1.549683810319317e-05, + "loss": 0.9719, + "step": 360 + }, + { + "epoch": 0.64349376114082, + "grad_norm": 2.2110652923583984, + "learning_rate": 1.5473215469037187e-05, + "loss": 0.9595, + "step": 361 + }, + { + "epoch": 0.6452762923351159, + "grad_norm": 1.9033259153366089, + "learning_rate": 1.544954914987238e-05, + "loss": 0.9726, + "step": 362 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 1.9950685501098633, + "learning_rate": 1.5425839334593803e-05, + "loss": 0.8797, + "step": 363 + }, + { + "epoch": 0.6488413547237076, + "grad_norm": 2.0263285636901855, + "learning_rate": 1.5402086212443694e-05, + "loss": 0.9327, + "step": 364 + }, + { + "epoch": 0.6506238859180036, + "grad_norm": 2.092421531677246, + "learning_rate": 1.5378289973009933e-05, + "loss": 0.9811, + "step": 365 + }, + { + "epoch": 0.6524064171122995, + "grad_norm": 2.227092981338501, + "learning_rate": 1.5354450806224553e-05, + "loss": 0.9588, + "step": 366 + }, + { + "epoch": 0.6541889483065954, + "grad_norm": 2.0348732471466064, + "learning_rate": 1.533056890236221e-05, + "loss": 0.9289, + "step": 367 + }, + { + "epoch": 0.6559714795008913, + "grad_norm": 2.1439108848571777, + "learning_rate": 1.5306644452038682e-05, + "loss": 1.0311, + "step": 368 + }, + { + "epoch": 0.6577540106951871, + "grad_norm": 2.0412769317626953, + "learning_rate": 1.528267764620932e-05, + "loss": 1.0458, + "step": 369 + }, + { + "epoch": 0.6595365418894831, + "grad_norm": 1.9768530130386353, + "learning_rate": 1.5258668676167548e-05, + "loss": 1.0051, + "step": 370 + }, + { + "epoch": 0.661319073083779, + "grad_norm": 1.908537745475769, + "learning_rate": 1.5234617733543334e-05, + "loss": 0.9829, + "step": 371 + }, + { + "epoch": 0.6631016042780749, + "grad_norm": 2.1109814643859863, + "learning_rate": 1.5210525010301638e-05, + "loss": 0.8928, + "step": 372 + }, + { + "epoch": 0.6648841354723708, + "grad_norm": 2.2573094367980957, + "learning_rate": 1.5186390698740909e-05, + "loss": 0.9743, + "step": 373 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.938982367515564, + "learning_rate": 1.516221499149154e-05, + "loss": 0.9388, + "step": 374 + }, + { + "epoch": 0.6684491978609626, + "grad_norm": 2.141972780227661, + "learning_rate": 1.5137998081514313e-05, + "loss": 0.972, + "step": 375 + }, + { + "epoch": 0.6702317290552585, + "grad_norm": 2.0705463886260986, + "learning_rate": 1.5113740162098887e-05, + "loss": 0.9987, + "step": 376 + }, + { + "epoch": 0.6720142602495544, + "grad_norm": 2.0516183376312256, + "learning_rate": 1.5089441426862239e-05, + "loss": 0.9494, + "step": 377 + }, + { + "epoch": 0.6737967914438503, + "grad_norm": 1.9151005744934082, + "learning_rate": 1.5065102069747117e-05, + "loss": 0.9614, + "step": 378 + }, + { + "epoch": 0.6755793226381461, + "grad_norm": 2.014390468597412, + "learning_rate": 1.5040722285020497e-05, + "loss": 0.9896, + "step": 379 + }, + { + "epoch": 0.6773618538324421, + "grad_norm": 2.075321912765503, + "learning_rate": 1.501630226727204e-05, + "loss": 0.9529, + "step": 380 + }, + { + "epoch": 0.679144385026738, + "grad_norm": 1.912244200706482, + "learning_rate": 1.499184221141252e-05, + "loss": 0.9753, + "step": 381 + }, + { + "epoch": 0.6809269162210339, + "grad_norm": 1.9480012655258179, + "learning_rate": 1.4967342312672283e-05, + "loss": 0.9064, + "step": 382 + }, + { + "epoch": 0.6827094474153298, + "grad_norm": 2.0167205333709717, + "learning_rate": 1.4942802766599694e-05, + "loss": 0.898, + "step": 383 + }, + { + "epoch": 0.6844919786096256, + "grad_norm": 2.0487241744995117, + "learning_rate": 1.491822376905955e-05, + "loss": 0.9681, + "step": 384 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 2.0675833225250244, + "learning_rate": 1.489360551623155e-05, + "loss": 1.0658, + "step": 385 + }, + { + "epoch": 0.6880570409982175, + "grad_norm": 2.2603049278259277, + "learning_rate": 1.48689482046087e-05, + "loss": 1.0167, + "step": 386 + }, + { + "epoch": 0.6898395721925134, + "grad_norm": 2.2933084964752197, + "learning_rate": 1.4844252030995768e-05, + "loss": 0.9918, + "step": 387 + }, + { + "epoch": 0.6916221033868093, + "grad_norm": 2.080951690673828, + "learning_rate": 1.4819517192507698e-05, + "loss": 0.9779, + "step": 388 + }, + { + "epoch": 0.6934046345811051, + "grad_norm": 2.087494134902954, + "learning_rate": 1.4794743886568034e-05, + "loss": 1.0108, + "step": 389 + }, + { + "epoch": 0.6951871657754011, + "grad_norm": 2.1779730319976807, + "learning_rate": 1.4769932310907372e-05, + "loss": 0.9769, + "step": 390 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 2.1749205589294434, + "learning_rate": 1.4745082663561741e-05, + "loss": 0.9973, + "step": 391 + }, + { + "epoch": 0.6987522281639929, + "grad_norm": 2.1819114685058594, + "learning_rate": 1.4720195142871054e-05, + "loss": 0.9246, + "step": 392 + }, + { + "epoch": 0.7005347593582888, + "grad_norm": 2.2912583351135254, + "learning_rate": 1.4695269947477506e-05, + "loss": 1.0427, + "step": 393 + }, + { + "epoch": 0.7023172905525846, + "grad_norm": 2.0110158920288086, + "learning_rate": 1.467030727632401e-05, + "loss": 0.9688, + "step": 394 + }, + { + "epoch": 0.7040998217468806, + "grad_norm": 2.1311464309692383, + "learning_rate": 1.4645307328652578e-05, + "loss": 0.9511, + "step": 395 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 2.2430477142333984, + "learning_rate": 1.4620270304002762e-05, + "loss": 1.0001, + "step": 396 + }, + { + "epoch": 0.7076648841354723, + "grad_norm": 2.282311201095581, + "learning_rate": 1.459519640221004e-05, + "loss": 1.0278, + "step": 397 + }, + { + "epoch": 0.7094474153297683, + "grad_norm": 2.0738134384155273, + "learning_rate": 1.4570085823404232e-05, + "loss": 0.9887, + "step": 398 + }, + { + "epoch": 0.7112299465240641, + "grad_norm": 2.2161929607391357, + "learning_rate": 1.45449387680079e-05, + "loss": 0.954, + "step": 399 + }, + { + "epoch": 0.7130124777183601, + "grad_norm": 1.9803860187530518, + "learning_rate": 1.4519755436734744e-05, + "loss": 0.9361, + "step": 400 + }, + { + "epoch": 0.714795008912656, + "grad_norm": 2.0878264904022217, + "learning_rate": 1.4494536030588003e-05, + "loss": 0.9482, + "step": 401 + }, + { + "epoch": 0.7165775401069518, + "grad_norm": 2.086885690689087, + "learning_rate": 1.4469280750858854e-05, + "loss": 0.9556, + "step": 402 + }, + { + "epoch": 0.7183600713012478, + "grad_norm": 1.8085094690322876, + "learning_rate": 1.4443989799124807e-05, + "loss": 0.9211, + "step": 403 + }, + { + "epoch": 0.7201426024955436, + "grad_norm": 1.8291845321655273, + "learning_rate": 1.4418663377248078e-05, + "loss": 1.0208, + "step": 404 + }, + { + "epoch": 0.7219251336898396, + "grad_norm": 2.2089219093322754, + "learning_rate": 1.4393301687374009e-05, + "loss": 0.9355, + "step": 405 + }, + { + "epoch": 0.7237076648841355, + "grad_norm": 1.7623047828674316, + "learning_rate": 1.4367904931929422e-05, + "loss": 0.9629, + "step": 406 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 2.1797492504119873, + "learning_rate": 1.4342473313621026e-05, + "loss": 0.9792, + "step": 407 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.8126072883605957, + "learning_rate": 1.4317007035433788e-05, + "loss": 0.9567, + "step": 408 + }, + { + "epoch": 0.7290552584670231, + "grad_norm": 2.481426239013672, + "learning_rate": 1.4291506300629322e-05, + "loss": 1.0067, + "step": 409 + }, + { + "epoch": 0.7308377896613191, + "grad_norm": 2.2379016876220703, + "learning_rate": 1.4265971312744252e-05, + "loss": 0.9958, + "step": 410 + }, + { + "epoch": 0.732620320855615, + "grad_norm": 1.8787038326263428, + "learning_rate": 1.4240402275588602e-05, + "loss": 0.9115, + "step": 411 + }, + { + "epoch": 0.7344028520499108, + "grad_norm": 1.802093267440796, + "learning_rate": 1.4214799393244167e-05, + "loss": 0.9095, + "step": 412 + }, + { + "epoch": 0.7361853832442068, + "grad_norm": 2.0813779830932617, + "learning_rate": 1.4189162870062869e-05, + "loss": 0.8838, + "step": 413 + }, + { + "epoch": 0.7379679144385026, + "grad_norm": 1.891223669052124, + "learning_rate": 1.4163492910665153e-05, + "loss": 0.8615, + "step": 414 + }, + { + "epoch": 0.7397504456327986, + "grad_norm": 1.8859976530075073, + "learning_rate": 1.4137789719938324e-05, + "loss": 0.9456, + "step": 415 + }, + { + "epoch": 0.7415329768270945, + "grad_norm": 1.8638348579406738, + "learning_rate": 1.4112053503034937e-05, + "loss": 0.8893, + "step": 416 + }, + { + "epoch": 0.7433155080213903, + "grad_norm": 2.4459497928619385, + "learning_rate": 1.4086284465371144e-05, + "loss": 0.9707, + "step": 417 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 2.1890289783477783, + "learning_rate": 1.4060482812625055e-05, + "loss": 1.0272, + "step": 418 + }, + { + "epoch": 0.7468805704099821, + "grad_norm": 2.002814769744873, + "learning_rate": 1.4034648750735109e-05, + "loss": 0.9049, + "step": 419 + }, + { + "epoch": 0.7486631016042781, + "grad_norm": 1.784708023071289, + "learning_rate": 1.4008782485898419e-05, + "loss": 0.8982, + "step": 420 + }, + { + "epoch": 0.750445632798574, + "grad_norm": 2.3620011806488037, + "learning_rate": 1.3982884224569121e-05, + "loss": 0.9832, + "step": 421 + }, + { + "epoch": 0.7522281639928698, + "grad_norm": 1.8406814336776733, + "learning_rate": 1.395695417345675e-05, + "loss": 0.9249, + "step": 422 + }, + { + "epoch": 0.7540106951871658, + "grad_norm": 2.3838629722595215, + "learning_rate": 1.393099253952456e-05, + "loss": 1.0131, + "step": 423 + }, + { + "epoch": 0.7540106951871658, + "eval_loss": 0.9591041803359985, + "eval_runtime": 329.7127, + "eval_samples_per_second": 8.923, + "eval_steps_per_second": 1.116, + "step": 423 + }, + { + "epoch": 0.7557932263814616, + "grad_norm": 1.7104860544204712, + "learning_rate": 1.390499952998789e-05, + "loss": 0.9101, + "step": 424 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 1.9570362567901611, + "learning_rate": 1.3878975352312511e-05, + "loss": 0.9648, + "step": 425 + }, + { + "epoch": 0.7593582887700535, + "grad_norm": 2.0794196128845215, + "learning_rate": 1.3852920214212966e-05, + "loss": 0.9525, + "step": 426 + }, + { + "epoch": 0.7611408199643493, + "grad_norm": 1.821881890296936, + "learning_rate": 1.3826834323650899e-05, + "loss": 1.0086, + "step": 427 + }, + { + "epoch": 0.7629233511586453, + "grad_norm": 2.062831401824951, + "learning_rate": 1.3800717888833423e-05, + "loss": 0.8474, + "step": 428 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 2.1243581771850586, + "learning_rate": 1.3774571118211441e-05, + "loss": 0.9913, + "step": 429 + }, + { + "epoch": 0.7664884135472371, + "grad_norm": 1.9317386150360107, + "learning_rate": 1.3748394220477972e-05, + "loss": 0.9431, + "step": 430 + }, + { + "epoch": 0.768270944741533, + "grad_norm": 2.076014518737793, + "learning_rate": 1.3722187404566508e-05, + "loss": 0.9975, + "step": 431 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 2.021941900253296, + "learning_rate": 1.3695950879649337e-05, + "loss": 1.0482, + "step": 432 + }, + { + "epoch": 0.7718360071301248, + "grad_norm": 1.8548147678375244, + "learning_rate": 1.3669684855135868e-05, + "loss": 0.9462, + "step": 433 + }, + { + "epoch": 0.7736185383244206, + "grad_norm": 1.9138809442520142, + "learning_rate": 1.3643389540670963e-05, + "loss": 0.9553, + "step": 434 + }, + { + "epoch": 0.7754010695187166, + "grad_norm": 2.0975797176361084, + "learning_rate": 1.361706514613327e-05, + "loss": 1.0501, + "step": 435 + }, + { + "epoch": 0.7771836007130125, + "grad_norm": 1.9075837135314941, + "learning_rate": 1.3590711881633535e-05, + "loss": 0.9563, + "step": 436 + }, + { + "epoch": 0.7789661319073083, + "grad_norm": 2.0391204357147217, + "learning_rate": 1.3564329957512941e-05, + "loss": 0.9746, + "step": 437 + }, + { + "epoch": 0.7807486631016043, + "grad_norm": 1.8339051008224487, + "learning_rate": 1.3537919584341413e-05, + "loss": 0.9903, + "step": 438 + }, + { + "epoch": 0.7825311942959001, + "grad_norm": 2.02894926071167, + "learning_rate": 1.3511480972915946e-05, + "loss": 1.0049, + "step": 439 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 1.8192859888076782, + "learning_rate": 1.348501433425893e-05, + "loss": 0.9277, + "step": 440 + }, + { + "epoch": 0.786096256684492, + "grad_norm": 2.4397940635681152, + "learning_rate": 1.3458519879616447e-05, + "loss": 1.0003, + "step": 441 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 1.9915246963500977, + "learning_rate": 1.3431997820456592e-05, + "loss": 1.0435, + "step": 442 + }, + { + "epoch": 0.7896613190730838, + "grad_norm": 1.7727062702178955, + "learning_rate": 1.3405448368467808e-05, + "loss": 0.9235, + "step": 443 + }, + { + "epoch": 0.7914438502673797, + "grad_norm": 2.1558029651641846, + "learning_rate": 1.3378871735557156e-05, + "loss": 1.047, + "step": 444 + }, + { + "epoch": 0.7932263814616756, + "grad_norm": 2.189209222793579, + "learning_rate": 1.335226813384865e-05, + "loss": 1.013, + "step": 445 + }, + { + "epoch": 0.7950089126559715, + "grad_norm": 2.119126319885254, + "learning_rate": 1.3325637775681561e-05, + "loss": 0.9492, + "step": 446 + }, + { + "epoch": 0.7967914438502673, + "grad_norm": 1.9351035356521606, + "learning_rate": 1.329898087360872e-05, + "loss": 0.97, + "step": 447 + }, + { + "epoch": 0.7985739750445633, + "grad_norm": 1.7888542413711548, + "learning_rate": 1.3272297640394818e-05, + "loss": 0.9156, + "step": 448 + }, + { + "epoch": 0.8003565062388592, + "grad_norm": 2.022596836090088, + "learning_rate": 1.3245588289014712e-05, + "loss": 0.9857, + "step": 449 + }, + { + "epoch": 0.8021390374331551, + "grad_norm": 1.9733895063400269, + "learning_rate": 1.3218853032651719e-05, + "loss": 0.9402, + "step": 450 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 2.3745057582855225, + "learning_rate": 1.3192092084695927e-05, + "loss": 0.9005, + "step": 451 + }, + { + "epoch": 0.8057040998217468, + "grad_norm": 2.057950496673584, + "learning_rate": 1.316530565874248e-05, + "loss": 0.933, + "step": 452 + }, + { + "epoch": 0.8074866310160428, + "grad_norm": 2.111551284790039, + "learning_rate": 1.3138493968589875e-05, + "loss": 0.9787, + "step": 453 + }, + { + "epoch": 0.8092691622103387, + "grad_norm": 2.0102903842926025, + "learning_rate": 1.3111657228238263e-05, + "loss": 0.9461, + "step": 454 + }, + { + "epoch": 0.8110516934046346, + "grad_norm": 1.9631766080856323, + "learning_rate": 1.3084795651887734e-05, + "loss": 0.9222, + "step": 455 + }, + { + "epoch": 0.8128342245989305, + "grad_norm": 2.0150415897369385, + "learning_rate": 1.3057909453936604e-05, + "loss": 0.9728, + "step": 456 + }, + { + "epoch": 0.8146167557932263, + "grad_norm": 2.039376735687256, + "learning_rate": 1.3030998848979714e-05, + "loss": 0.9848, + "step": 457 + }, + { + "epoch": 0.8163992869875223, + "grad_norm": 1.98569917678833, + "learning_rate": 1.3004064051806712e-05, + "loss": 0.9544, + "step": 458 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 1.67859947681427, + "learning_rate": 1.2977105277400331e-05, + "loss": 0.9606, + "step": 459 + }, + { + "epoch": 0.8199643493761141, + "grad_norm": 1.8734769821166992, + "learning_rate": 1.2950122740934691e-05, + "loss": 0.9627, + "step": 460 + }, + { + "epoch": 0.82174688057041, + "grad_norm": 2.455686330795288, + "learning_rate": 1.2923116657773571e-05, + "loss": 1.0661, + "step": 461 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 1.980596899986267, + "learning_rate": 1.2896087243468673e-05, + "loss": 0.9292, + "step": 462 + }, + { + "epoch": 0.8253119429590018, + "grad_norm": 2.062181234359741, + "learning_rate": 1.2869034713757949e-05, + "loss": 0.9011, + "step": 463 + }, + { + "epoch": 0.8270944741532977, + "grad_norm": 1.7144403457641602, + "learning_rate": 1.2841959284563818e-05, + "loss": 0.9019, + "step": 464 + }, + { + "epoch": 0.8288770053475936, + "grad_norm": 2.134111166000366, + "learning_rate": 1.2814861171991495e-05, + "loss": 0.9295, + "step": 465 + }, + { + "epoch": 0.8306595365418895, + "grad_norm": 2.0311472415924072, + "learning_rate": 1.2787740592327232e-05, + "loss": 0.8818, + "step": 466 + }, + { + "epoch": 0.8324420677361853, + "grad_norm": 1.967252492904663, + "learning_rate": 1.2760597762036614e-05, + "loss": 0.8939, + "step": 467 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 1.8478994369506836, + "learning_rate": 1.2733432897762814e-05, + "loss": 0.9583, + "step": 468 + }, + { + "epoch": 0.8360071301247772, + "grad_norm": 1.8894236087799072, + "learning_rate": 1.270624621632487e-05, + "loss": 0.8987, + "step": 469 + }, + { + "epoch": 0.8377896613190731, + "grad_norm": 2.0267107486724854, + "learning_rate": 1.267903793471597e-05, + "loss": 0.8838, + "step": 470 + }, + { + "epoch": 0.839572192513369, + "grad_norm": 2.0982789993286133, + "learning_rate": 1.2651808270101688e-05, + "loss": 0.9397, + "step": 471 + }, + { + "epoch": 0.8413547237076648, + "grad_norm": 2.369551420211792, + "learning_rate": 1.2624557439818277e-05, + "loss": 0.9967, + "step": 472 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 2.0669097900390625, + "learning_rate": 1.2597285661370928e-05, + "loss": 0.9919, + "step": 473 + }, + { + "epoch": 0.8449197860962567, + "grad_norm": 1.8921197652816772, + "learning_rate": 1.2569993152432028e-05, + "loss": 0.8554, + "step": 474 + }, + { + "epoch": 0.8467023172905526, + "grad_norm": 2.012568235397339, + "learning_rate": 1.254268013083943e-05, + "loss": 0.8641, + "step": 475 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 2.1181392669677734, + "learning_rate": 1.2515346814594704e-05, + "loss": 0.9415, + "step": 476 + }, + { + "epoch": 0.8502673796791443, + "grad_norm": 2.0450236797332764, + "learning_rate": 1.248799342186141e-05, + "loss": 0.8715, + "step": 477 + }, + { + "epoch": 0.8520499108734403, + "grad_norm": 2.0089316368103027, + "learning_rate": 1.2460620170963353e-05, + "loss": 0.9163, + "step": 478 + }, + { + "epoch": 0.8538324420677362, + "grad_norm": 2.2603275775909424, + "learning_rate": 1.2433227280382827e-05, + "loss": 0.8885, + "step": 479 + }, + { + "epoch": 0.8556149732620321, + "grad_norm": 2.2340140342712402, + "learning_rate": 1.2405814968758889e-05, + "loss": 0.9835, + "step": 480 + }, + { + "epoch": 0.857397504456328, + "grad_norm": 1.7993637323379517, + "learning_rate": 1.2378383454885614e-05, + "loss": 0.9201, + "step": 481 + }, + { + "epoch": 0.8591800356506238, + "grad_norm": 1.803969383239746, + "learning_rate": 1.2350932957710322e-05, + "loss": 0.9015, + "step": 482 + }, + { + "epoch": 0.8609625668449198, + "grad_norm": 2.1099390983581543, + "learning_rate": 1.2323463696331873e-05, + "loss": 0.9213, + "step": 483 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 2.0957653522491455, + "learning_rate": 1.2295975889998882e-05, + "loss": 0.9323, + "step": 484 + }, + { + "epoch": 0.8645276292335116, + "grad_norm": 1.8833292722702026, + "learning_rate": 1.226846975810798e-05, + "loss": 0.997, + "step": 485 + }, + { + "epoch": 0.8663101604278075, + "grad_norm": 1.7665544748306274, + "learning_rate": 1.2240945520202079e-05, + "loss": 0.9256, + "step": 486 + }, + { + "epoch": 0.8680926916221033, + "grad_norm": 1.7884703874588013, + "learning_rate": 1.2213403395968593e-05, + "loss": 0.9072, + "step": 487 + }, + { + "epoch": 0.8698752228163993, + "grad_norm": 1.8856773376464844, + "learning_rate": 1.2185843605237698e-05, + "loss": 0.9515, + "step": 488 + }, + { + "epoch": 0.8716577540106952, + "grad_norm": 1.8611061573028564, + "learning_rate": 1.2158266367980584e-05, + "loss": 0.8748, + "step": 489 + }, + { + "epoch": 0.8734402852049911, + "grad_norm": 1.7856934070587158, + "learning_rate": 1.2130671904307692e-05, + "loss": 0.838, + "step": 490 + }, + { + "epoch": 0.875222816399287, + "grad_norm": 1.7748630046844482, + "learning_rate": 1.2103060434466946e-05, + "loss": 0.9826, + "step": 491 + }, + { + "epoch": 0.8770053475935828, + "grad_norm": 2.0026159286499023, + "learning_rate": 1.2075432178842022e-05, + "loss": 0.9543, + "step": 492 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 2.055417537689209, + "learning_rate": 1.2047787357950561e-05, + "loss": 0.9127, + "step": 493 + }, + { + "epoch": 0.8805704099821747, + "grad_norm": 2.038173198699951, + "learning_rate": 1.202012619244243e-05, + "loss": 0.9169, + "step": 494 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 1.8826080560684204, + "learning_rate": 1.1992448903097948e-05, + "loss": 0.8814, + "step": 495 + }, + { + "epoch": 0.8841354723707665, + "grad_norm": 1.911270260810852, + "learning_rate": 1.1964755710826125e-05, + "loss": 0.8788, + "step": 496 + }, + { + "epoch": 0.8859180035650623, + "grad_norm": 2.083873987197876, + "learning_rate": 1.1937046836662906e-05, + "loss": 0.9855, + "step": 497 + }, + { + "epoch": 0.8877005347593583, + "grad_norm": 1.874510645866394, + "learning_rate": 1.1909322501769407e-05, + "loss": 0.841, + "step": 498 + }, + { + "epoch": 0.8894830659536542, + "grad_norm": 2.250204563140869, + "learning_rate": 1.1881582927430135e-05, + "loss": 0.968, + "step": 499 + }, + { + "epoch": 0.8912655971479501, + "grad_norm": 1.907017707824707, + "learning_rate": 1.1853828335051236e-05, + "loss": 0.8956, + "step": 500 + }, + { + "epoch": 0.893048128342246, + "grad_norm": 2.0900657176971436, + "learning_rate": 1.182605894615873e-05, + "loss": 0.9568, + "step": 501 + }, + { + "epoch": 0.8948306595365418, + "grad_norm": 1.7974416017532349, + "learning_rate": 1.1798274982396728e-05, + "loss": 0.9507, + "step": 502 + }, + { + "epoch": 0.8966131907308378, + "grad_norm": 1.8360202312469482, + "learning_rate": 1.1770476665525673e-05, + "loss": 0.9114, + "step": 503 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 2.1357533931732178, + "learning_rate": 1.1742664217420584e-05, + "loss": 0.965, + "step": 504 + }, + { + "epoch": 0.9001782531194296, + "grad_norm": 2.0421371459960938, + "learning_rate": 1.1714837860069243e-05, + "loss": 0.9259, + "step": 505 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 1.7875169515609741, + "learning_rate": 1.1686997815570473e-05, + "loss": 0.9225, + "step": 506 + }, + { + "epoch": 0.9037433155080213, + "grad_norm": 1.9849542379379272, + "learning_rate": 1.1659144306132332e-05, + "loss": 0.9419, + "step": 507 + }, + { + "epoch": 0.9055258467023173, + "grad_norm": 2.0346877574920654, + "learning_rate": 1.163127755407035e-05, + "loss": 0.9155, + "step": 508 + }, + { + "epoch": 0.9073083778966132, + "grad_norm": 1.891310453414917, + "learning_rate": 1.1603397781805754e-05, + "loss": 0.8802, + "step": 509 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.672937273979187, + "learning_rate": 1.15755052118637e-05, + "loss": 0.8794, + "step": 510 + }, + { + "epoch": 0.910873440285205, + "grad_norm": 2.178529977798462, + "learning_rate": 1.154760006687148e-05, + "loss": 0.9578, + "step": 511 + }, + { + "epoch": 0.9126559714795008, + "grad_norm": 2.18163800239563, + "learning_rate": 1.1519682569556758e-05, + "loss": 0.9322, + "step": 512 + }, + { + "epoch": 0.9144385026737968, + "grad_norm": 1.9352266788482666, + "learning_rate": 1.14917529427458e-05, + "loss": 0.8904, + "step": 513 + }, + { + "epoch": 0.9162210338680927, + "grad_norm": 2.227802038192749, + "learning_rate": 1.1463811409361667e-05, + "loss": 1.0417, + "step": 514 + }, + { + "epoch": 0.9180035650623886, + "grad_norm": 1.9616986513137817, + "learning_rate": 1.1435858192422464e-05, + "loss": 0.8749, + "step": 515 + }, + { + "epoch": 0.9197860962566845, + "grad_norm": 2.0140492916107178, + "learning_rate": 1.1407893515039555e-05, + "loss": 0.9186, + "step": 516 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 1.7624683380126953, + "learning_rate": 1.1379917600415758e-05, + "loss": 0.9289, + "step": 517 + }, + { + "epoch": 0.9233511586452763, + "grad_norm": 1.8611254692077637, + "learning_rate": 1.13519306718436e-05, + "loss": 0.924, + "step": 518 + }, + { + "epoch": 0.9251336898395722, + "grad_norm": 1.6258656978607178, + "learning_rate": 1.1323932952703512e-05, + "loss": 0.9176, + "step": 519 + }, + { + "epoch": 0.9269162210338681, + "grad_norm": 2.0411124229431152, + "learning_rate": 1.1295924666462042e-05, + "loss": 0.9044, + "step": 520 + }, + { + "epoch": 0.928698752228164, + "grad_norm": 2.3665778636932373, + "learning_rate": 1.1267906036670093e-05, + "loss": 1.0121, + "step": 521 + }, + { + "epoch": 0.93048128342246, + "grad_norm": 2.205308675765991, + "learning_rate": 1.1239877286961123e-05, + "loss": 0.9616, + "step": 522 + }, + { + "epoch": 0.9322638146167558, + "grad_norm": 1.706140398979187, + "learning_rate": 1.1211838641049355e-05, + "loss": 0.9018, + "step": 523 + }, + { + "epoch": 0.9340463458110517, + "grad_norm": 1.945457935333252, + "learning_rate": 1.1183790322728012e-05, + "loss": 1.0168, + "step": 524 + }, + { + "epoch": 0.9358288770053476, + "grad_norm": 1.9941941499710083, + "learning_rate": 1.1155732555867504e-05, + "loss": 0.9059, + "step": 525 + }, + { + "epoch": 0.9376114081996435, + "grad_norm": 2.1140549182891846, + "learning_rate": 1.112766556441367e-05, + "loss": 0.9084, + "step": 526 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 1.8418564796447754, + "learning_rate": 1.1099589572385968e-05, + "loss": 0.9572, + "step": 527 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 1.7444286346435547, + "learning_rate": 1.1071504803875692e-05, + "loss": 0.9027, + "step": 528 + }, + { + "epoch": 0.9429590017825312, + "grad_norm": 2.1305134296417236, + "learning_rate": 1.1043411483044193e-05, + "loss": 0.9695, + "step": 529 + }, + { + "epoch": 0.9447415329768271, + "grad_norm": 1.7305046319961548, + "learning_rate": 1.1015309834121083e-05, + "loss": 0.8617, + "step": 530 + }, + { + "epoch": 0.946524064171123, + "grad_norm": 1.9054243564605713, + "learning_rate": 1.098720008140244e-05, + "loss": 0.8649, + "step": 531 + }, + { + "epoch": 0.948306595365419, + "grad_norm": 1.9462075233459473, + "learning_rate": 1.0959082449249025e-05, + "loss": 0.9458, + "step": 532 + }, + { + "epoch": 0.9500891265597148, + "grad_norm": 2.21990966796875, + "learning_rate": 1.0930957162084496e-05, + "loss": 0.9427, + "step": 533 + }, + { + "epoch": 0.9518716577540107, + "grad_norm": 1.8306422233581543, + "learning_rate": 1.0902824444393602e-05, + "loss": 0.9289, + "step": 534 + }, + { + "epoch": 0.9536541889483066, + "grad_norm": 1.9472652673721313, + "learning_rate": 1.0874684520720405e-05, + "loss": 0.937, + "step": 535 + }, + { + "epoch": 0.9554367201426025, + "grad_norm": 1.8183170557022095, + "learning_rate": 1.0846537615666477e-05, + "loss": 0.9081, + "step": 536 + }, + { + "epoch": 0.9572192513368984, + "grad_norm": 1.9509567022323608, + "learning_rate": 1.0818383953889118e-05, + "loss": 0.9423, + "step": 537 + }, + { + "epoch": 0.9590017825311943, + "grad_norm": 1.9274513721466064, + "learning_rate": 1.079022376009955e-05, + "loss": 0.9819, + "step": 538 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 2.0134758949279785, + "learning_rate": 1.0762057259061143e-05, + "loss": 0.9518, + "step": 539 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 1.5931446552276611, + "learning_rate": 1.0733884675587595e-05, + "loss": 0.9256, + "step": 540 + }, + { + "epoch": 0.964349376114082, + "grad_norm": 1.8069127798080444, + "learning_rate": 1.0705706234541163e-05, + "loss": 0.9294, + "step": 541 + }, + { + "epoch": 0.966131907308378, + "grad_norm": 2.059453010559082, + "learning_rate": 1.067752216083085e-05, + "loss": 0.9317, + "step": 542 + }, + { + "epoch": 0.9679144385026738, + "grad_norm": 1.7917944192886353, + "learning_rate": 1.0649332679410615e-05, + "loss": 0.8465, + "step": 543 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 1.9388025999069214, + "learning_rate": 1.062113801527759e-05, + "loss": 0.8946, + "step": 544 + }, + { + "epoch": 0.9714795008912656, + "grad_norm": 1.9177924394607544, + "learning_rate": 1.0592938393470267e-05, + "loss": 0.9512, + "step": 545 + }, + { + "epoch": 0.9732620320855615, + "grad_norm": 2.0775253772735596, + "learning_rate": 1.05647340390667e-05, + "loss": 0.9381, + "step": 546 + }, + { + "epoch": 0.9750445632798574, + "grad_norm": 2.1442859172821045, + "learning_rate": 1.0536525177182728e-05, + "loss": 0.8385, + "step": 547 + }, + { + "epoch": 0.9768270944741533, + "grad_norm": 2.0453238487243652, + "learning_rate": 1.0508312032970165e-05, + "loss": 0.8533, + "step": 548 + }, + { + "epoch": 0.9786096256684492, + "grad_norm": 2.2256951332092285, + "learning_rate": 1.0480094831614998e-05, + "loss": 0.9253, + "step": 549 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 1.551547884941101, + "learning_rate": 1.0451873798335605e-05, + "loss": 0.8915, + "step": 550 + }, + { + "epoch": 0.982174688057041, + "grad_norm": 1.9802758693695068, + "learning_rate": 1.0423649158380947e-05, + "loss": 0.9275, + "step": 551 + }, + { + "epoch": 0.983957219251337, + "grad_norm": 2.0591816902160645, + "learning_rate": 1.0395421137028761e-05, + "loss": 0.929, + "step": 552 + }, + { + "epoch": 0.9857397504456328, + "grad_norm": 2.0869827270507812, + "learning_rate": 1.0367189959583791e-05, + "loss": 0.9333, + "step": 553 + }, + { + "epoch": 0.9875222816399287, + "grad_norm": 1.6645945310592651, + "learning_rate": 1.0338955851375962e-05, + "loss": 0.8624, + "step": 554 + }, + { + "epoch": 0.9893048128342246, + "grad_norm": 1.5197809934616089, + "learning_rate": 1.031071903775859e-05, + "loss": 0.8751, + "step": 555 + }, + { + "epoch": 0.9910873440285205, + "grad_norm": 1.6992182731628418, + "learning_rate": 1.0282479744106589e-05, + "loss": 0.8991, + "step": 556 + }, + { + "epoch": 0.9928698752228164, + "grad_norm": 1.7648831605911255, + "learning_rate": 1.0254238195814659e-05, + "loss": 0.9318, + "step": 557 + }, + { + "epoch": 0.9946524064171123, + "grad_norm": 1.7822153568267822, + "learning_rate": 1.0225994618295507e-05, + "loss": 1.0017, + "step": 558 + }, + { + "epoch": 0.9964349376114082, + "grad_norm": 1.8108640909194946, + "learning_rate": 1.0197749236978034e-05, + "loss": 0.8772, + "step": 559 + }, + { + "epoch": 0.9982174688057041, + "grad_norm": 1.8217051029205322, + "learning_rate": 1.0169502277305528e-05, + "loss": 0.9624, + "step": 560 + }, + { + "epoch": 1.0, + "grad_norm": 2.1218109130859375, + "learning_rate": 1.0141253964733886e-05, + "loss": 0.9463, + "step": 561 + }, + { + "epoch": 1.0017825311942958, + "grad_norm": 2.0523557662963867, + "learning_rate": 1.01130045247298e-05, + "loss": 0.9625, + "step": 562 + }, + { + "epoch": 1.0035650623885919, + "grad_norm": 1.754342794418335, + "learning_rate": 1.0084754182768959e-05, + "loss": 0.9668, + "step": 563 + }, + { + "epoch": 1.0053475935828877, + "grad_norm": 1.6300525665283203, + "learning_rate": 1.0056503164334252e-05, + "loss": 0.8519, + "step": 564 + }, + { + "epoch": 1.0053475935828877, + "eval_loss": 0.9207794070243835, + "eval_runtime": 329.8091, + "eval_samples_per_second": 8.92, + "eval_steps_per_second": 1.116, + "step": 564 + }, + { + "epoch": 1.0071301247771836, + "grad_norm": 1.831165075302124, + "learning_rate": 1.0028251694913971e-05, + "loss": 0.9349, + "step": 565 + }, + { + "epoch": 1.0089126559714796, + "grad_norm": 1.8127673864364624, + "learning_rate": 1e-05, + "loss": 0.9269, + "step": 566 + }, + { + "epoch": 1.0106951871657754, + "grad_norm": 2.10707426071167, + "learning_rate": 9.97174830508603e-06, + "loss": 1.0066, + "step": 567 + }, + { + "epoch": 1.0124777183600713, + "grad_norm": 1.9743505716323853, + "learning_rate": 9.943496835665751e-06, + "loss": 0.88, + "step": 568 + }, + { + "epoch": 1.014260249554367, + "grad_norm": 2.391684055328369, + "learning_rate": 9.915245817231044e-06, + "loss": 0.89, + "step": 569 + }, + { + "epoch": 1.0160427807486632, + "grad_norm": 2.0105912685394287, + "learning_rate": 9.886995475270205e-06, + "loss": 0.8331, + "step": 570 + }, + { + "epoch": 1.017825311942959, + "grad_norm": 1.8959972858428955, + "learning_rate": 9.85874603526612e-06, + "loss": 0.9467, + "step": 571 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 1.8537402153015137, + "learning_rate": 9.830497722694477e-06, + "loss": 0.902, + "step": 572 + }, + { + "epoch": 1.0213903743315509, + "grad_norm": 1.906813383102417, + "learning_rate": 9.802250763021972e-06, + "loss": 0.9137, + "step": 573 + }, + { + "epoch": 1.0231729055258467, + "grad_norm": 1.8584880828857422, + "learning_rate": 9.774005381704498e-06, + "loss": 0.9084, + "step": 574 + }, + { + "epoch": 1.0249554367201426, + "grad_norm": 1.7670100927352905, + "learning_rate": 9.745761804185346e-06, + "loss": 0.9244, + "step": 575 + }, + { + "epoch": 1.0267379679144386, + "grad_norm": 1.6661604642868042, + "learning_rate": 9.717520255893415e-06, + "loss": 0.8834, + "step": 576 + }, + { + "epoch": 1.0285204991087344, + "grad_norm": 1.8599357604980469, + "learning_rate": 9.689280962241411e-06, + "loss": 0.9902, + "step": 577 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 1.8606723546981812, + "learning_rate": 9.661044148624038e-06, + "loss": 0.935, + "step": 578 + }, + { + "epoch": 1.0017825311942958, + "grad_norm": 1.840346336364746, + "learning_rate": 9.63281004041621e-06, + "loss": 0.7274, + "step": 579 + }, + { + "epoch": 1.0035650623885919, + "grad_norm": 1.8151859045028687, + "learning_rate": 9.60457886297124e-06, + "loss": 0.6942, + "step": 580 + }, + { + "epoch": 1.0053475935828877, + "grad_norm": 2.0746374130249023, + "learning_rate": 9.576350841619057e-06, + "loss": 0.7098, + "step": 581 + }, + { + "epoch": 1.0071301247771836, + "grad_norm": 2.0493171215057373, + "learning_rate": 9.548126201664398e-06, + "loss": 0.7328, + "step": 582 + }, + { + "epoch": 1.0089126559714796, + "grad_norm": 1.6757124662399292, + "learning_rate": 9.519905168385004e-06, + "loss": 0.6247, + "step": 583 + }, + { + "epoch": 1.0106951871657754, + "grad_norm": 1.8793476819992065, + "learning_rate": 9.491687967029839e-06, + "loss": 0.6553, + "step": 584 + }, + { + "epoch": 1.0124777183600713, + "grad_norm": 1.6432827711105347, + "learning_rate": 9.463474822817274e-06, + "loss": 0.6913, + "step": 585 + }, + { + "epoch": 1.014260249554367, + "grad_norm": 2.0046606063842773, + "learning_rate": 9.435265960933304e-06, + "loss": 0.6651, + "step": 586 + }, + { + "epoch": 1.0160427807486632, + "grad_norm": 2.0444467067718506, + "learning_rate": 9.407061606529736e-06, + "loss": 0.6596, + "step": 587 + }, + { + "epoch": 1.017825311942959, + "grad_norm": 2.309298515319824, + "learning_rate": 9.378861984722411e-06, + "loss": 0.7316, + "step": 588 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 2.2565701007843018, + "learning_rate": 9.350667320589387e-06, + "loss": 0.6892, + "step": 589 + }, + { + "epoch": 1.0213903743315509, + "grad_norm": 2.022109270095825, + "learning_rate": 9.322477839169156e-06, + "loss": 0.7209, + "step": 590 + }, + { + "epoch": 1.0231729055258467, + "grad_norm": 2.318474054336548, + "learning_rate": 9.294293765458844e-06, + "loss": 0.6836, + "step": 591 + }, + { + "epoch": 1.0249554367201426, + "grad_norm": 2.0525121688842773, + "learning_rate": 9.26611532441241e-06, + "loss": 0.6873, + "step": 592 + }, + { + "epoch": 1.0267379679144386, + "grad_norm": 2.1689891815185547, + "learning_rate": 9.237942740938862e-06, + "loss": 0.7309, + "step": 593 + }, + { + "epoch": 1.0285204991087344, + "grad_norm": 2.0695502758026123, + "learning_rate": 9.209776239900453e-06, + "loss": 0.6479, + "step": 594 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 2.0197293758392334, + "learning_rate": 9.181616046110889e-06, + "loss": 0.7159, + "step": 595 + }, + { + "epoch": 1.032085561497326, + "grad_norm": 1.8955740928649902, + "learning_rate": 9.153462384333525e-06, + "loss": 0.6547, + "step": 596 + }, + { + "epoch": 1.0338680926916222, + "grad_norm": 1.9541598558425903, + "learning_rate": 9.125315479279597e-06, + "loss": 0.6928, + "step": 597 + }, + { + "epoch": 1.035650623885918, + "grad_norm": 1.8442533016204834, + "learning_rate": 9.097175555606396e-06, + "loss": 0.6086, + "step": 598 + }, + { + "epoch": 1.0374331550802138, + "grad_norm": 1.706773042678833, + "learning_rate": 9.069042837915506e-06, + "loss": 0.6705, + "step": 599 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 1.8387624025344849, + "learning_rate": 9.040917550750976e-06, + "loss": 0.671, + "step": 600 + }, + { + "epoch": 1.0409982174688057, + "grad_norm": 1.6865826845169067, + "learning_rate": 9.012799918597562e-06, + "loss": 0.6913, + "step": 601 + }, + { + "epoch": 1.0427807486631016, + "grad_norm": 2.1576929092407227, + "learning_rate": 8.98469016587892e-06, + "loss": 0.6028, + "step": 602 + }, + { + "epoch": 1.0445632798573976, + "grad_norm": 1.873191475868225, + "learning_rate": 8.956588516955809e-06, + "loss": 0.6492, + "step": 603 + }, + { + "epoch": 1.0463458110516934, + "grad_norm": 1.9716717004776, + "learning_rate": 8.928495196124311e-06, + "loss": 0.6813, + "step": 604 + }, + { + "epoch": 1.0481283422459893, + "grad_norm": 1.963454008102417, + "learning_rate": 8.900410427614036e-06, + "loss": 0.6293, + "step": 605 + }, + { + "epoch": 1.049910873440285, + "grad_norm": 1.8422149419784546, + "learning_rate": 8.872334435586333e-06, + "loss": 0.6259, + "step": 606 + }, + { + "epoch": 1.0516934046345812, + "grad_norm": 2.425657272338867, + "learning_rate": 8.844267444132499e-06, + "loss": 0.7206, + "step": 607 + }, + { + "epoch": 1.053475935828877, + "grad_norm": 1.8881195783615112, + "learning_rate": 8.816209677271991e-06, + "loss": 0.6916, + "step": 608 + }, + { + "epoch": 1.0552584670231728, + "grad_norm": 2.1058132648468018, + "learning_rate": 8.788161358950649e-06, + "loss": 0.6965, + "step": 609 + }, + { + "epoch": 1.0570409982174689, + "grad_norm": 2.0532283782958984, + "learning_rate": 8.76012271303888e-06, + "loss": 0.6694, + "step": 610 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 1.8037793636322021, + "learning_rate": 8.732093963329908e-06, + "loss": 0.6497, + "step": 611 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 1.674838900566101, + "learning_rate": 8.704075333537963e-06, + "loss": 0.6706, + "step": 612 + }, + { + "epoch": 1.0623885918003566, + "grad_norm": 2.194537878036499, + "learning_rate": 8.676067047296495e-06, + "loss": 0.7005, + "step": 613 + }, + { + "epoch": 1.0641711229946524, + "grad_norm": 1.9183086156845093, + "learning_rate": 8.648069328156403e-06, + "loss": 0.6841, + "step": 614 + }, + { + "epoch": 1.0659536541889483, + "grad_norm": 1.9987599849700928, + "learning_rate": 8.620082399584247e-06, + "loss": 0.6381, + "step": 615 + }, + { + "epoch": 1.067736185383244, + "grad_norm": 1.9794397354125977, + "learning_rate": 8.592106484960447e-06, + "loss": 0.6248, + "step": 616 + }, + { + "epoch": 1.0695187165775402, + "grad_norm": 1.6956665515899658, + "learning_rate": 8.564141807577535e-06, + "loss": 0.671, + "step": 617 + }, + { + "epoch": 1.071301247771836, + "grad_norm": 1.9094434976577759, + "learning_rate": 8.536188590638334e-06, + "loss": 0.6372, + "step": 618 + }, + { + "epoch": 1.0730837789661318, + "grad_norm": 2.042654275894165, + "learning_rate": 8.5082470572542e-06, + "loss": 0.6743, + "step": 619 + }, + { + "epoch": 1.0748663101604279, + "grad_norm": 2.3108088970184326, + "learning_rate": 8.480317430443242e-06, + "loss": 0.7117, + "step": 620 + }, + { + "epoch": 1.0766488413547237, + "grad_norm": 2.5405666828155518, + "learning_rate": 8.452399933128523e-06, + "loss": 0.6924, + "step": 621 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 2.2125980854034424, + "learning_rate": 8.424494788136303e-06, + "loss": 0.6396, + "step": 622 + }, + { + "epoch": 1.0802139037433156, + "grad_norm": 1.8044298887252808, + "learning_rate": 8.396602218194248e-06, + "loss": 0.66, + "step": 623 + }, + { + "epoch": 1.0819964349376114, + "grad_norm": 2.01863169670105, + "learning_rate": 8.368722445929653e-06, + "loss": 0.6819, + "step": 624 + }, + { + "epoch": 1.0837789661319073, + "grad_norm": 2.040804386138916, + "learning_rate": 8.34085569386767e-06, + "loss": 0.7114, + "step": 625 + }, + { + "epoch": 1.085561497326203, + "grad_norm": 2.0602118968963623, + "learning_rate": 8.313002184429529e-06, + "loss": 0.6833, + "step": 626 + }, + { + "epoch": 1.0873440285204992, + "grad_norm": 1.781294345855713, + "learning_rate": 8.285162139930759e-06, + "loss": 0.6208, + "step": 627 + }, + { + "epoch": 1.089126559714795, + "grad_norm": 1.9819340705871582, + "learning_rate": 8.257335782579419e-06, + "loss": 0.6679, + "step": 628 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 2.2654504776000977, + "learning_rate": 8.229523334474328e-06, + "loss": 0.6833, + "step": 629 + }, + { + "epoch": 1.0926916221033869, + "grad_norm": 2.196784734725952, + "learning_rate": 8.201725017603277e-06, + "loss": 0.6615, + "step": 630 + }, + { + "epoch": 1.0944741532976827, + "grad_norm": 2.2365517616271973, + "learning_rate": 8.173941053841275e-06, + "loss": 0.6903, + "step": 631 + }, + { + "epoch": 1.0962566844919786, + "grad_norm": 2.070225954055786, + "learning_rate": 8.146171664948769e-06, + "loss": 0.6732, + "step": 632 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 1.6552367210388184, + "learning_rate": 8.118417072569871e-06, + "loss": 0.7107, + "step": 633 + }, + { + "epoch": 1.0998217468805704, + "grad_norm": 2.4717843532562256, + "learning_rate": 8.090677498230598e-06, + "loss": 0.733, + "step": 634 + }, + { + "epoch": 1.1016042780748663, + "grad_norm": 1.726193904876709, + "learning_rate": 8.062953163337097e-06, + "loss": 0.7136, + "step": 635 + }, + { + "epoch": 1.1033868092691623, + "grad_norm": 2.0040595531463623, + "learning_rate": 8.035244289173876e-06, + "loss": 0.6663, + "step": 636 + }, + { + "epoch": 1.1051693404634582, + "grad_norm": 1.8458000421524048, + "learning_rate": 8.007551096902055e-06, + "loss": 0.6524, + "step": 637 + }, + { + "epoch": 1.106951871657754, + "grad_norm": 1.8933156728744507, + "learning_rate": 7.97987380755757e-06, + "loss": 0.591, + "step": 638 + }, + { + "epoch": 1.1087344028520498, + "grad_norm": 1.9119715690612793, + "learning_rate": 7.95221264204944e-06, + "loss": 0.624, + "step": 639 + }, + { + "epoch": 1.1105169340463459, + "grad_norm": 2.19096302986145, + "learning_rate": 7.924567821157981e-06, + "loss": 0.7283, + "step": 640 + }, + { + "epoch": 1.1122994652406417, + "grad_norm": 1.7500660419464111, + "learning_rate": 7.896939565533056e-06, + "loss": 0.683, + "step": 641 + }, + { + "epoch": 1.1140819964349375, + "grad_norm": 2.377091407775879, + "learning_rate": 7.869328095692313e-06, + "loss": 0.6901, + "step": 642 + }, + { + "epoch": 1.1158645276292336, + "grad_norm": 2.178258180618286, + "learning_rate": 7.841733632019419e-06, + "loss": 0.6515, + "step": 643 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 2.0514190196990967, + "learning_rate": 7.814156394762306e-06, + "loss": 0.6545, + "step": 644 + }, + { + "epoch": 1.1194295900178253, + "grad_norm": 1.8169552087783813, + "learning_rate": 7.78659660403141e-06, + "loss": 0.6809, + "step": 645 + }, + { + "epoch": 1.121212121212121, + "grad_norm": 1.7938815355300903, + "learning_rate": 7.759054479797924e-06, + "loss": 0.5212, + "step": 646 + }, + { + "epoch": 1.1229946524064172, + "grad_norm": 1.9933950901031494, + "learning_rate": 7.731530241892021e-06, + "loss": 0.6459, + "step": 647 + }, + { + "epoch": 1.124777183600713, + "grad_norm": 1.9721087217330933, + "learning_rate": 7.70402411000112e-06, + "loss": 0.6865, + "step": 648 + }, + { + "epoch": 1.1265597147950088, + "grad_norm": 2.2990550994873047, + "learning_rate": 7.67653630366813e-06, + "loss": 0.7032, + "step": 649 + }, + { + "epoch": 1.1283422459893049, + "grad_norm": 1.9798555374145508, + "learning_rate": 7.649067042289681e-06, + "loss": 0.6512, + "step": 650 + }, + { + "epoch": 1.1301247771836007, + "grad_norm": 2.096360445022583, + "learning_rate": 7.621616545114392e-06, + "loss": 0.6178, + "step": 651 + }, + { + "epoch": 1.1319073083778965, + "grad_norm": 2.1116669178009033, + "learning_rate": 7.594185031241114e-06, + "loss": 0.5875, + "step": 652 + }, + { + "epoch": 1.1336898395721926, + "grad_norm": 1.7805746793746948, + "learning_rate": 7.5667727196171795e-06, + "loss": 0.6481, + "step": 653 + }, + { + "epoch": 1.1354723707664884, + "grad_norm": 2.0286173820495605, + "learning_rate": 7.539379829036652e-06, + "loss": 0.639, + "step": 654 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 2.071683406829834, + "learning_rate": 7.512006578138593e-06, + "loss": 0.646, + "step": 655 + }, + { + "epoch": 1.1390374331550803, + "grad_norm": 1.7731094360351562, + "learning_rate": 7.484653185405295e-06, + "loss": 0.621, + "step": 656 + }, + { + "epoch": 1.1408199643493762, + "grad_norm": 1.7044860124588013, + "learning_rate": 7.457319869160572e-06, + "loss": 0.6831, + "step": 657 + }, + { + "epoch": 1.142602495543672, + "grad_norm": 1.930659294128418, + "learning_rate": 7.430006847567972e-06, + "loss": 0.6447, + "step": 658 + }, + { + "epoch": 1.1443850267379678, + "grad_norm": 2.157649040222168, + "learning_rate": 7.402714338629072e-06, + "loss": 0.6754, + "step": 659 + }, + { + "epoch": 1.1461675579322639, + "grad_norm": 1.9548604488372803, + "learning_rate": 7.375442560181725e-06, + "loss": 0.6472, + "step": 660 + }, + { + "epoch": 1.1479500891265597, + "grad_norm": 2.099818706512451, + "learning_rate": 7.348191729898315e-06, + "loss": 0.6487, + "step": 661 + }, + { + "epoch": 1.1497326203208555, + "grad_norm": 1.9589630365371704, + "learning_rate": 7.320962065284032e-06, + "loss": 0.6708, + "step": 662 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 2.367982864379883, + "learning_rate": 7.293753783675132e-06, + "loss": 0.6582, + "step": 663 + }, + { + "epoch": 1.1532976827094474, + "grad_norm": 2.1937007904052734, + "learning_rate": 7.26656710223719e-06, + "loss": 0.641, + "step": 664 + }, + { + "epoch": 1.1550802139037433, + "grad_norm": 1.922019600868225, + "learning_rate": 7.239402237963389e-06, + "loss": 0.6647, + "step": 665 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 2.0569257736206055, + "learning_rate": 7.2122594076727705e-06, + "loss": 0.6558, + "step": 666 + }, + { + "epoch": 1.1586452762923352, + "grad_norm": 1.9273213148117065, + "learning_rate": 7.185138828008509e-06, + "loss": 0.6673, + "step": 667 + }, + { + "epoch": 1.160427807486631, + "grad_norm": 2.145204544067383, + "learning_rate": 7.158040715436184e-06, + "loss": 0.639, + "step": 668 + }, + { + "epoch": 1.1622103386809268, + "grad_norm": 1.9361869096755981, + "learning_rate": 7.1309652862420546e-06, + "loss": 0.7118, + "step": 669 + }, + { + "epoch": 1.1639928698752229, + "grad_norm": 2.302577257156372, + "learning_rate": 7.1039127565313285e-06, + "loss": 0.6418, + "step": 670 + }, + { + "epoch": 1.1657754010695187, + "grad_norm": 1.851583480834961, + "learning_rate": 7.076883342226435e-06, + "loss": 0.6092, + "step": 671 + }, + { + "epoch": 1.1675579322638145, + "grad_norm": 1.97311532497406, + "learning_rate": 7.049877259065312e-06, + "loss": 0.6821, + "step": 672 + }, + { + "epoch": 1.1693404634581106, + "grad_norm": 1.8712797164916992, + "learning_rate": 7.022894722599673e-06, + "loss": 0.6464, + "step": 673 + }, + { + "epoch": 1.1711229946524064, + "grad_norm": 2.1860239505767822, + "learning_rate": 6.995935948193294e-06, + "loss": 0.6394, + "step": 674 + }, + { + "epoch": 1.1729055258467023, + "grad_norm": 2.164442539215088, + "learning_rate": 6.969001151020289e-06, + "loss": 0.665, + "step": 675 + }, + { + "epoch": 1.1746880570409983, + "grad_norm": 2.0195791721343994, + "learning_rate": 6.9420905460633955e-06, + "loss": 0.6232, + "step": 676 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 1.70195734500885, + "learning_rate": 6.915204348112268e-06, + "loss": 0.613, + "step": 677 + }, + { + "epoch": 1.17825311942959, + "grad_norm": 2.0594401359558105, + "learning_rate": 6.888342771761737e-06, + "loss": 0.6564, + "step": 678 + }, + { + "epoch": 1.1800356506238858, + "grad_norm": 1.9122473001480103, + "learning_rate": 6.861506031410125e-06, + "loss": 0.6624, + "step": 679 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 2.3307456970214844, + "learning_rate": 6.834694341257521e-06, + "loss": 0.6366, + "step": 680 + }, + { + "epoch": 1.1836007130124777, + "grad_norm": 1.771002173423767, + "learning_rate": 6.807907915304075e-06, + "loss": 0.637, + "step": 681 + }, + { + "epoch": 1.1853832442067735, + "grad_norm": 1.8012512922286987, + "learning_rate": 6.781146967348283e-06, + "loss": 0.5959, + "step": 682 + }, + { + "epoch": 1.1871657754010696, + "grad_norm": 1.9354641437530518, + "learning_rate": 6.754411710985291e-06, + "loss": 0.5782, + "step": 683 + }, + { + "epoch": 1.1889483065953654, + "grad_norm": 2.067774534225464, + "learning_rate": 6.727702359605185e-06, + "loss": 0.6635, + "step": 684 + }, + { + "epoch": 1.1907308377896613, + "grad_norm": 2.001398801803589, + "learning_rate": 6.701019126391282e-06, + "loss": 0.5945, + "step": 685 + }, + { + "epoch": 1.192513368983957, + "grad_norm": 1.8363351821899414, + "learning_rate": 6.6743622243184405e-06, + "loss": 0.6435, + "step": 686 + }, + { + "epoch": 1.1942959001782532, + "grad_norm": 1.7396736145019531, + "learning_rate": 6.647731866151355e-06, + "loss": 0.6195, + "step": 687 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 2.132824659347534, + "learning_rate": 6.6211282644428475e-06, + "loss": 0.6538, + "step": 688 + }, + { + "epoch": 1.1978609625668448, + "grad_norm": 2.2380311489105225, + "learning_rate": 6.594551631532193e-06, + "loss": 0.6601, + "step": 689 + }, + { + "epoch": 1.1996434937611409, + "grad_norm": 1.9663583040237427, + "learning_rate": 6.568002179543409e-06, + "loss": 0.6255, + "step": 690 + }, + { + "epoch": 1.2014260249554367, + "grad_norm": 2.0859267711639404, + "learning_rate": 6.541480120383558e-06, + "loss": 0.6336, + "step": 691 + }, + { + "epoch": 1.2032085561497325, + "grad_norm": 1.7798055410385132, + "learning_rate": 6.514985665741074e-06, + "loss": 0.6164, + "step": 692 + }, + { + "epoch": 1.2049910873440286, + "grad_norm": 1.9800183773040771, + "learning_rate": 6.488519027084057e-06, + "loss": 0.6288, + "step": 693 + }, + { + "epoch": 1.2067736185383244, + "grad_norm": 2.3849520683288574, + "learning_rate": 6.462080415658591e-06, + "loss": 0.6915, + "step": 694 + }, + { + "epoch": 1.2085561497326203, + "grad_norm": 1.9202960729599, + "learning_rate": 6.435670042487063e-06, + "loss": 0.6595, + "step": 695 + }, + { + "epoch": 1.2103386809269163, + "grad_norm": 1.947290301322937, + "learning_rate": 6.409288118366465e-06, + "loss": 0.6351, + "step": 696 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 1.7837152481079102, + "learning_rate": 6.38293485386673e-06, + "loss": 0.5829, + "step": 697 + }, + { + "epoch": 1.213903743315508, + "grad_norm": 1.7423700094223022, + "learning_rate": 6.356610459329038e-06, + "loss": 0.6512, + "step": 698 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 2.0267062187194824, + "learning_rate": 6.330315144864133e-06, + "loss": 0.6964, + "step": 699 + }, + { + "epoch": 1.2174688057040999, + "grad_norm": 2.088834762573242, + "learning_rate": 6.304049120350664e-06, + "loss": 0.6915, + "step": 700 + }, + { + "epoch": 1.2192513368983957, + "grad_norm": 2.190154790878296, + "learning_rate": 6.277812595433495e-06, + "loss": 0.6787, + "step": 701 + }, + { + "epoch": 1.2210338680926915, + "grad_norm": 2.1694934368133545, + "learning_rate": 6.251605779522032e-06, + "loss": 0.7416, + "step": 702 + }, + { + "epoch": 1.2228163992869876, + "grad_norm": 1.927525520324707, + "learning_rate": 6.225428881788562e-06, + "loss": 0.6981, + "step": 703 + }, + { + "epoch": 1.2245989304812834, + "grad_norm": 1.847321629524231, + "learning_rate": 6.199282111166578e-06, + "loss": 0.6869, + "step": 704 + }, + { + "epoch": 1.2263814616755793, + "grad_norm": 1.8871819972991943, + "learning_rate": 6.173165676349103e-06, + "loss": 0.5826, + "step": 705 + }, + { + "epoch": 1.2263814616755793, + "eval_loss": 0.9314715266227722, + "eval_runtime": 329.4034, + "eval_samples_per_second": 8.931, + "eval_steps_per_second": 1.117, + "step": 705 + }, + { + "epoch": 1.228163992869875, + "grad_norm": 1.7520660161972046, + "learning_rate": 6.147079785787038e-06, + "loss": 0.6601, + "step": 706 + }, + { + "epoch": 1.2299465240641712, + "grad_norm": 2.066180467605591, + "learning_rate": 6.121024647687491e-06, + "loss": 0.6879, + "step": 707 + }, + { + "epoch": 1.231729055258467, + "grad_norm": 1.692914366722107, + "learning_rate": 6.095000470012112e-06, + "loss": 0.6889, + "step": 708 + }, + { + "epoch": 1.2335115864527628, + "grad_norm": 1.7511357069015503, + "learning_rate": 6.069007460475443e-06, + "loss": 0.6463, + "step": 709 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 1.5913302898406982, + "learning_rate": 6.043045826543254e-06, + "loss": 0.6332, + "step": 710 + }, + { + "epoch": 1.2370766488413547, + "grad_norm": 1.9926722049713135, + "learning_rate": 6.017115775430882e-06, + "loss": 0.6227, + "step": 711 + }, + { + "epoch": 1.2388591800356505, + "grad_norm": 2.430294990539551, + "learning_rate": 5.991217514101586e-06, + "loss": 0.6776, + "step": 712 + }, + { + "epoch": 1.2406417112299466, + "grad_norm": 1.9795050621032715, + "learning_rate": 5.965351249264895e-06, + "loss": 0.6389, + "step": 713 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 1.7031593322753906, + "learning_rate": 5.93951718737495e-06, + "loss": 0.5936, + "step": 714 + }, + { + "epoch": 1.2442067736185383, + "grad_norm": 1.8579968214035034, + "learning_rate": 5.91371553462886e-06, + "loss": 0.6108, + "step": 715 + }, + { + "epoch": 1.2459893048128343, + "grad_norm": 1.869901180267334, + "learning_rate": 5.8879464969650645e-06, + "loss": 0.6401, + "step": 716 + }, + { + "epoch": 1.2477718360071302, + "grad_norm": 2.143773317337036, + "learning_rate": 5.862210280061676e-06, + "loss": 0.6346, + "step": 717 + }, + { + "epoch": 1.249554367201426, + "grad_norm": 2.0378427505493164, + "learning_rate": 5.836507089334849e-06, + "loss": 0.5339, + "step": 718 + }, + { + "epoch": 1.251336898395722, + "grad_norm": 2.2269880771636963, + "learning_rate": 5.81083712993713e-06, + "loss": 0.6808, + "step": 719 + }, + { + "epoch": 1.2531194295900179, + "grad_norm": 2.203113555908203, + "learning_rate": 5.785200606755839e-06, + "loss": 0.6719, + "step": 720 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 1.8317898511886597, + "learning_rate": 5.759597724411401e-06, + "loss": 0.6935, + "step": 721 + }, + { + "epoch": 1.2566844919786098, + "grad_norm": 1.8929369449615479, + "learning_rate": 5.7340286872557515e-06, + "loss": 0.5599, + "step": 722 + }, + { + "epoch": 1.2584670231729056, + "grad_norm": 1.679452896118164, + "learning_rate": 5.708493699370681e-06, + "loss": 0.6314, + "step": 723 + }, + { + "epoch": 1.2602495543672014, + "grad_norm": 2.099459409713745, + "learning_rate": 5.682992964566213e-06, + "loss": 0.6407, + "step": 724 + }, + { + "epoch": 1.2620320855614973, + "grad_norm": 2.3213915824890137, + "learning_rate": 5.657526686378975e-06, + "loss": 0.6363, + "step": 725 + }, + { + "epoch": 1.263814616755793, + "grad_norm": 2.0476975440979004, + "learning_rate": 5.6320950680705826e-06, + "loss": 0.7205, + "step": 726 + }, + { + "epoch": 1.2655971479500892, + "grad_norm": 2.2015089988708496, + "learning_rate": 5.606698312625995e-06, + "loss": 0.6329, + "step": 727 + }, + { + "epoch": 1.267379679144385, + "grad_norm": 1.7152899503707886, + "learning_rate": 5.581336622751924e-06, + "loss": 0.6307, + "step": 728 + }, + { + "epoch": 1.2691622103386808, + "grad_norm": 2.1059634685516357, + "learning_rate": 5.556010200875197e-06, + "loss": 0.5633, + "step": 729 + }, + { + "epoch": 1.2709447415329769, + "grad_norm": 2.0322976112365723, + "learning_rate": 5.530719249141148e-06, + "loss": 0.5963, + "step": 730 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 2.0814220905303955, + "learning_rate": 5.505463969412e-06, + "loss": 0.6359, + "step": 731 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 1.858677864074707, + "learning_rate": 5.480244563265263e-06, + "loss": 0.6106, + "step": 732 + }, + { + "epoch": 1.2762923351158646, + "grad_norm": 2.226047992706299, + "learning_rate": 5.4550612319921044e-06, + "loss": 0.6129, + "step": 733 + }, + { + "epoch": 1.2780748663101604, + "grad_norm": 2.1976687908172607, + "learning_rate": 5.429914176595772e-06, + "loss": 0.6231, + "step": 734 + }, + { + "epoch": 1.2798573975044563, + "grad_norm": 1.840461015701294, + "learning_rate": 5.4048035977899604e-06, + "loss": 0.6002, + "step": 735 + }, + { + "epoch": 1.2816399286987523, + "grad_norm": 1.7595787048339844, + "learning_rate": 5.3797296959972375e-06, + "loss": 0.6185, + "step": 736 + }, + { + "epoch": 1.2834224598930482, + "grad_norm": 2.267902374267578, + "learning_rate": 5.354692671347426e-06, + "loss": 0.6246, + "step": 737 + }, + { + "epoch": 1.285204991087344, + "grad_norm": 1.9644438028335571, + "learning_rate": 5.329692723675994e-06, + "loss": 0.6002, + "step": 738 + }, + { + "epoch": 1.28698752228164, + "grad_norm": 2.6038873195648193, + "learning_rate": 5.304730052522495e-06, + "loss": 0.683, + "step": 739 + }, + { + "epoch": 1.2887700534759359, + "grad_norm": 2.0842905044555664, + "learning_rate": 5.27980485712895e-06, + "loss": 0.6482, + "step": 740 + }, + { + "epoch": 1.2905525846702317, + "grad_norm": 1.7665643692016602, + "learning_rate": 5.254917336438261e-06, + "loss": 0.6249, + "step": 741 + }, + { + "epoch": 1.2923351158645278, + "grad_norm": 1.8952322006225586, + "learning_rate": 5.230067689092629e-06, + "loss": 0.6405, + "step": 742 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 1.8656741380691528, + "learning_rate": 5.205256113431969e-06, + "loss": 0.6718, + "step": 743 + }, + { + "epoch": 1.2959001782531194, + "grad_norm": 1.841829776763916, + "learning_rate": 5.180482807492309e-06, + "loss": 0.6472, + "step": 744 + }, + { + "epoch": 1.2976827094474153, + "grad_norm": 2.3330764770507812, + "learning_rate": 5.1557479690042355e-06, + "loss": 0.6351, + "step": 745 + }, + { + "epoch": 1.299465240641711, + "grad_norm": 2.0946245193481445, + "learning_rate": 5.131051795391302e-06, + "loss": 0.6327, + "step": 746 + }, + { + "epoch": 1.3012477718360071, + "grad_norm": 1.9958549737930298, + "learning_rate": 5.106394483768453e-06, + "loss": 0.5928, + "step": 747 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 2.39805269241333, + "learning_rate": 5.081776230940452e-06, + "loss": 0.6207, + "step": 748 + }, + { + "epoch": 1.3048128342245988, + "grad_norm": 2.0922398567199707, + "learning_rate": 5.057197233400313e-06, + "loss": 0.696, + "step": 749 + }, + { + "epoch": 1.3065953654188949, + "grad_norm": 2.055389642715454, + "learning_rate": 5.03265768732772e-06, + "loss": 0.6663, + "step": 750 + }, + { + "epoch": 1.3083778966131907, + "grad_norm": 2.2379744052886963, + "learning_rate": 5.008157788587485e-06, + "loss": 0.6594, + "step": 751 + }, + { + "epoch": 1.3101604278074865, + "grad_norm": 2.141425609588623, + "learning_rate": 4.983697732727965e-06, + "loss": 0.6661, + "step": 752 + }, + { + "epoch": 1.3119429590017826, + "grad_norm": 2.284132480621338, + "learning_rate": 4.959277714979506e-06, + "loss": 0.6737, + "step": 753 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 1.9574843645095825, + "learning_rate": 4.934897930252887e-06, + "loss": 0.6836, + "step": 754 + }, + { + "epoch": 1.3155080213903743, + "grad_norm": 1.9436346292495728, + "learning_rate": 4.910558573137763e-06, + "loss": 0.6683, + "step": 755 + }, + { + "epoch": 1.3172905525846703, + "grad_norm": 1.832389235496521, + "learning_rate": 4.886259837901113e-06, + "loss": 0.6404, + "step": 756 + }, + { + "epoch": 1.3190730837789661, + "grad_norm": 1.9720847606658936, + "learning_rate": 4.8620019184856884e-06, + "loss": 0.6259, + "step": 757 + }, + { + "epoch": 1.320855614973262, + "grad_norm": 1.904499888420105, + "learning_rate": 4.837785008508462e-06, + "loss": 0.6508, + "step": 758 + }, + { + "epoch": 1.322638146167558, + "grad_norm": 1.9810409545898438, + "learning_rate": 4.813609301259091e-06, + "loss": 0.6935, + "step": 759 + }, + { + "epoch": 1.3244206773618539, + "grad_norm": 2.132432699203491, + "learning_rate": 4.789474989698368e-06, + "loss": 0.5755, + "step": 760 + }, + { + "epoch": 1.3262032085561497, + "grad_norm": 2.0095694065093994, + "learning_rate": 4.765382266456673e-06, + "loss": 0.6273, + "step": 761 + }, + { + "epoch": 1.3279857397504458, + "grad_norm": 2.2048168182373047, + "learning_rate": 4.7413313238324556e-06, + "loss": 0.6762, + "step": 762 + }, + { + "epoch": 1.3297682709447416, + "grad_norm": 1.8805530071258545, + "learning_rate": 4.717322353790684e-06, + "loss": 0.6606, + "step": 763 + }, + { + "epoch": 1.3315508021390374, + "grad_norm": 1.9276254177093506, + "learning_rate": 4.69335554796132e-06, + "loss": 0.6225, + "step": 764 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.8722336292266846, + "learning_rate": 4.669431097637789e-06, + "loss": 0.624, + "step": 765 + }, + { + "epoch": 1.3351158645276293, + "grad_norm": 1.9843535423278809, + "learning_rate": 4.645549193775452e-06, + "loss": 0.5885, + "step": 766 + }, + { + "epoch": 1.3368983957219251, + "grad_norm": 2.110941171646118, + "learning_rate": 4.6217100269900704e-06, + "loss": 0.5695, + "step": 767 + }, + { + "epoch": 1.338680926916221, + "grad_norm": 2.172638416290283, + "learning_rate": 4.597913787556308e-06, + "loss": 0.6313, + "step": 768 + }, + { + "epoch": 1.3404634581105168, + "grad_norm": 2.2388806343078613, + "learning_rate": 4.574160665406199e-06, + "loss": 0.5842, + "step": 769 + }, + { + "epoch": 1.3422459893048129, + "grad_norm": 2.3002452850341797, + "learning_rate": 4.550450850127626e-06, + "loss": 0.7563, + "step": 770 + }, + { + "epoch": 1.3440285204991087, + "grad_norm": 1.9640158414840698, + "learning_rate": 4.5267845309628164e-06, + "loss": 0.6389, + "step": 771 + }, + { + "epoch": 1.3458110516934045, + "grad_norm": 2.21022629737854, + "learning_rate": 4.503161896806832e-06, + "loss": 0.5932, + "step": 772 + }, + { + "epoch": 1.3475935828877006, + "grad_norm": 1.6235777139663696, + "learning_rate": 4.47958313620606e-06, + "loss": 0.5943, + "step": 773 + }, + { + "epoch": 1.3493761140819964, + "grad_norm": 2.571772575378418, + "learning_rate": 4.4560484373566945e-06, + "loss": 0.667, + "step": 774 + }, + { + "epoch": 1.3511586452762923, + "grad_norm": 1.9727287292480469, + "learning_rate": 4.4325579881032535e-06, + "loss": 0.59, + "step": 775 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 1.6672075986862183, + "learning_rate": 4.409111975937082e-06, + "loss": 0.5799, + "step": 776 + }, + { + "epoch": 1.3547237076648841, + "grad_norm": 1.9198261499404907, + "learning_rate": 4.385710587994836e-06, + "loss": 0.614, + "step": 777 + }, + { + "epoch": 1.35650623885918, + "grad_norm": 2.1884326934814453, + "learning_rate": 4.3623540110569935e-06, + "loss": 0.6769, + "step": 778 + }, + { + "epoch": 1.358288770053476, + "grad_norm": 2.1063883304595947, + "learning_rate": 4.339042431546381e-06, + "loss": 0.6494, + "step": 779 + }, + { + "epoch": 1.3600713012477719, + "grad_norm": 2.000030755996704, + "learning_rate": 4.315776035526666e-06, + "loss": 0.6632, + "step": 780 + }, + { + "epoch": 1.3618538324420677, + "grad_norm": 2.023421287536621, + "learning_rate": 4.292555008700885e-06, + "loss": 0.6351, + "step": 781 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 2.176358699798584, + "learning_rate": 4.26937953640995e-06, + "loss": 0.5972, + "step": 782 + }, + { + "epoch": 1.3654188948306596, + "grad_norm": 1.9683263301849365, + "learning_rate": 4.24624980363118e-06, + "loss": 0.602, + "step": 783 + }, + { + "epoch": 1.3672014260249554, + "grad_norm": 1.9342206716537476, + "learning_rate": 4.223165994976819e-06, + "loss": 0.6334, + "step": 784 + }, + { + "epoch": 1.3689839572192513, + "grad_norm": 1.9298633337020874, + "learning_rate": 4.200128294692555e-06, + "loss": 0.686, + "step": 785 + }, + { + "epoch": 1.3707664884135473, + "grad_norm": 2.0359370708465576, + "learning_rate": 4.177136886656067e-06, + "loss": 0.6508, + "step": 786 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 1.9444752931594849, + "learning_rate": 4.154191954375546e-06, + "loss": 0.6446, + "step": 787 + }, + { + "epoch": 1.374331550802139, + "grad_norm": 2.000025510787964, + "learning_rate": 4.13129368098823e-06, + "loss": 0.5784, + "step": 788 + }, + { + "epoch": 1.3761140819964348, + "grad_norm": 1.9926388263702393, + "learning_rate": 4.108442249258946e-06, + "loss": 0.5659, + "step": 789 + }, + { + "epoch": 1.3778966131907309, + "grad_norm": 2.011845350265503, + "learning_rate": 4.085637841578652e-06, + "loss": 0.6143, + "step": 790 + }, + { + "epoch": 1.3796791443850267, + "grad_norm": 2.0345590114593506, + "learning_rate": 4.062880639962969e-06, + "loss": 0.5627, + "step": 791 + }, + { + "epoch": 1.3814616755793225, + "grad_norm": 1.7945417165756226, + "learning_rate": 4.04017082605075e-06, + "loss": 0.6093, + "step": 792 + }, + { + "epoch": 1.3832442067736186, + "grad_norm": 2.027658700942993, + "learning_rate": 4.017508581102612e-06, + "loss": 0.5853, + "step": 793 + }, + { + "epoch": 1.3850267379679144, + "grad_norm": 2.184375047683716, + "learning_rate": 3.9948940859994964e-06, + "loss": 0.6131, + "step": 794 + }, + { + "epoch": 1.3868092691622103, + "grad_norm": 1.7773215770721436, + "learning_rate": 3.972327521241227e-06, + "loss": 0.6008, + "step": 795 + }, + { + "epoch": 1.3885918003565063, + "grad_norm": 1.9173908233642578, + "learning_rate": 3.949809066945064e-06, + "loss": 0.5749, + "step": 796 + }, + { + "epoch": 1.3903743315508021, + "grad_norm": 2.451683759689331, + "learning_rate": 3.927338902844271e-06, + "loss": 0.704, + "step": 797 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 2.4320666790008545, + "learning_rate": 3.9049172082866786e-06, + "loss": 0.7077, + "step": 798 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 2.23956561088562, + "learning_rate": 3.8825441622332496e-06, + "loss": 0.6582, + "step": 799 + }, + { + "epoch": 1.3957219251336899, + "grad_norm": 2.500241756439209, + "learning_rate": 3.8602199432566585e-06, + "loss": 0.6123, + "step": 800 + }, + { + "epoch": 1.3975044563279857, + "grad_norm": 1.892356276512146, + "learning_rate": 3.837944729539864e-06, + "loss": 0.6394, + "step": 801 + }, + { + "epoch": 1.3992869875222818, + "grad_norm": 2.28920841217041, + "learning_rate": 3.815718698874672e-06, + "loss": 0.59, + "step": 802 + }, + { + "epoch": 1.4010695187165776, + "grad_norm": 1.9007834196090698, + "learning_rate": 3.793542028660342e-06, + "loss": 0.6232, + "step": 803 + }, + { + "epoch": 1.4028520499108734, + "grad_norm": 2.0133168697357178, + "learning_rate": 3.7714148959021523e-06, + "loss": 0.6705, + "step": 804 + }, + { + "epoch": 1.4046345811051695, + "grad_norm": 2.347144603729248, + "learning_rate": 3.7493374772099944e-06, + "loss": 0.6269, + "step": 805 + }, + { + "epoch": 1.4064171122994653, + "grad_norm": 1.9117176532745361, + "learning_rate": 3.727309948796963e-06, + "loss": 0.6566, + "step": 806 + }, + { + "epoch": 1.4081996434937611, + "grad_norm": 2.026123046875, + "learning_rate": 3.7053324864779504e-06, + "loss": 0.6331, + "step": 807 + }, + { + "epoch": 1.409982174688057, + "grad_norm": 2.160001277923584, + "learning_rate": 3.6834052656682316e-06, + "loss": 0.615, + "step": 808 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 2.2837295532226562, + "learning_rate": 3.6615284613820845e-06, + "loss": 0.6356, + "step": 809 + }, + { + "epoch": 1.4135472370766489, + "grad_norm": 2.0919857025146484, + "learning_rate": 3.6397022482313804e-06, + "loss": 0.6751, + "step": 810 + }, + { + "epoch": 1.4153297682709447, + "grad_norm": 1.7666800022125244, + "learning_rate": 3.6179268004241906e-06, + "loss": 0.6264, + "step": 811 + }, + { + "epoch": 1.4171122994652405, + "grad_norm": 1.8978486061096191, + "learning_rate": 3.5962022917633977e-06, + "loss": 0.6581, + "step": 812 + }, + { + "epoch": 1.4188948306595366, + "grad_norm": 1.7779465913772583, + "learning_rate": 3.5745288956453126e-06, + "loss": 0.6369, + "step": 813 + }, + { + "epoch": 1.4206773618538324, + "grad_norm": 1.783486247062683, + "learning_rate": 3.552906785058278e-06, + "loss": 0.6153, + "step": 814 + }, + { + "epoch": 1.4224598930481283, + "grad_norm": 2.0954372882843018, + "learning_rate": 3.531336132581299e-06, + "loss": 0.6021, + "step": 815 + }, + { + "epoch": 1.4242424242424243, + "grad_norm": 2.061905860900879, + "learning_rate": 3.5098171103826716e-06, + "loss": 0.6079, + "step": 816 + }, + { + "epoch": 1.4260249554367201, + "grad_norm": 2.2032501697540283, + "learning_rate": 3.488349890218592e-06, + "loss": 0.6178, + "step": 817 + }, + { + "epoch": 1.427807486631016, + "grad_norm": 2.141941547393799, + "learning_rate": 3.466934643431795e-06, + "loss": 0.6102, + "step": 818 + }, + { + "epoch": 1.429590017825312, + "grad_norm": 2.2551662921905518, + "learning_rate": 3.4455715409501776e-06, + "loss": 0.639, + "step": 819 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 2.1851370334625244, + "learning_rate": 3.4242607532854533e-06, + "loss": 0.6023, + "step": 820 + }, + { + "epoch": 1.4331550802139037, + "grad_norm": 2.0705204010009766, + "learning_rate": 3.403002450531773e-06, + "loss": 0.5642, + "step": 821 + }, + { + "epoch": 1.4349376114081998, + "grad_norm": 2.305468797683716, + "learning_rate": 3.3817968023643766e-06, + "loss": 0.6493, + "step": 822 + }, + { + "epoch": 1.4367201426024956, + "grad_norm": 2.520127296447754, + "learning_rate": 3.3606439780382348e-06, + "loss": 0.7074, + "step": 823 + }, + { + "epoch": 1.4385026737967914, + "grad_norm": 2.0943031311035156, + "learning_rate": 3.339544146386704e-06, + "loss": 0.6028, + "step": 824 + }, + { + "epoch": 1.4402852049910875, + "grad_norm": 2.1441266536712646, + "learning_rate": 3.3184974758201637e-06, + "loss": 0.5351, + "step": 825 + }, + { + "epoch": 1.4420677361853833, + "grad_norm": 2.0502147674560547, + "learning_rate": 3.2975041343246937e-06, + "loss": 0.6036, + "step": 826 + }, + { + "epoch": 1.4438502673796791, + "grad_norm": 2.0448861122131348, + "learning_rate": 3.2765642894607186e-06, + "loss": 0.6573, + "step": 827 + }, + { + "epoch": 1.445632798573975, + "grad_norm": 1.9928828477859497, + "learning_rate": 3.2556781083616727e-06, + "loss": 0.5587, + "step": 828 + }, + { + "epoch": 1.4474153297682708, + "grad_norm": 1.7939866781234741, + "learning_rate": 3.234845757732673e-06, + "loss": 0.6953, + "step": 829 + }, + { + "epoch": 1.4491978609625669, + "grad_norm": 2.1059212684631348, + "learning_rate": 3.214067403849179e-06, + "loss": 0.6814, + "step": 830 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 2.4292821884155273, + "learning_rate": 3.193343212555667e-06, + "loss": 0.6433, + "step": 831 + }, + { + "epoch": 1.4527629233511585, + "grad_norm": 1.789070725440979, + "learning_rate": 3.1726733492643157e-06, + "loss": 0.5441, + "step": 832 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 1.9609915018081665, + "learning_rate": 3.1520579789536753e-06, + "loss": 0.6271, + "step": 833 + }, + { + "epoch": 1.4563279857397504, + "grad_norm": 1.9168120622634888, + "learning_rate": 3.1314972661673572e-06, + "loss": 0.6025, + "step": 834 + }, + { + "epoch": 1.4581105169340463, + "grad_norm": 2.104306221008301, + "learning_rate": 3.1109913750127175e-06, + "loss": 0.6286, + "step": 835 + }, + { + "epoch": 1.4598930481283423, + "grad_norm": 1.847529411315918, + "learning_rate": 3.0905404691595476e-06, + "loss": 0.6097, + "step": 836 + }, + { + "epoch": 1.4616755793226381, + "grad_norm": 1.9516671895980835, + "learning_rate": 3.0701447118387673e-06, + "loss": 0.6254, + "step": 837 + }, + { + "epoch": 1.463458110516934, + "grad_norm": 2.1860153675079346, + "learning_rate": 3.0498042658411276e-06, + "loss": 0.5936, + "step": 838 + }, + { + "epoch": 1.46524064171123, + "grad_norm": 2.162168502807617, + "learning_rate": 3.0295192935159e-06, + "loss": 0.6193, + "step": 839 + }, + { + "epoch": 1.4670231729055259, + "grad_norm": 1.9501457214355469, + "learning_rate": 3.009289956769593e-06, + "loss": 0.6031, + "step": 840 + }, + { + "epoch": 1.4688057040998217, + "grad_norm": 2.083054542541504, + "learning_rate": 2.989116417064655e-06, + "loss": 0.5807, + "step": 841 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 2.124634027481079, + "learning_rate": 2.9689988354181742e-06, + "loss": 0.6467, + "step": 842 + }, + { + "epoch": 1.4723707664884136, + "grad_norm": 2.0739710330963135, + "learning_rate": 2.9489373724006164e-06, + "loss": 0.6345, + "step": 843 + }, + { + "epoch": 1.4741532976827094, + "grad_norm": 1.7244806289672852, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.6698, + "step": 844 + }, + { + "epoch": 1.4759358288770055, + "grad_norm": 1.802803874015808, + "learning_rate": 2.908983442293253e-06, + "loss": 0.6248, + "step": 845 + }, + { + "epoch": 1.4777183600713013, + "grad_norm": 2.0008552074432373, + "learning_rate": 2.8890912940996784e-06, + "loss": 0.6227, + "step": 846 + }, + { + "epoch": 1.4777183600713013, + "eval_loss": 0.9287152290344238, + "eval_runtime": 329.6324, + "eval_samples_per_second": 8.925, + "eval_steps_per_second": 1.116, + "step": 846 + }, + { + "epoch": 1.4795008912655971, + "grad_norm": 2.082883358001709, + "learning_rate": 2.8692559023249457e-06, + "loss": 0.6235, + "step": 847 + }, + { + "epoch": 1.481283422459893, + "grad_norm": 2.0350584983825684, + "learning_rate": 2.8494774252871913e-06, + "loss": 0.603, + "step": 848 + }, + { + "epoch": 1.483065953654189, + "grad_norm": 2.247058868408203, + "learning_rate": 2.829756020850274e-06, + "loss": 0.6736, + "step": 849 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 2.168816328048706, + "learning_rate": 2.8100918464225304e-06, + "loss": 0.652, + "step": 850 + }, + { + "epoch": 1.4866310160427807, + "grad_norm": 2.2460670471191406, + "learning_rate": 2.7904850589555065e-06, + "loss": 0.6744, + "step": 851 + }, + { + "epoch": 1.4884135472370765, + "grad_norm": 1.918690800666809, + "learning_rate": 2.7709358149427114e-06, + "loss": 0.6582, + "step": 852 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 2.372283458709717, + "learning_rate": 2.751444270418361e-06, + "loss": 0.6133, + "step": 853 + }, + { + "epoch": 1.4919786096256684, + "grad_norm": 1.9066613912582397, + "learning_rate": 2.7320105809561415e-06, + "loss": 0.605, + "step": 854 + }, + { + "epoch": 1.4937611408199643, + "grad_norm": 2.066586494445801, + "learning_rate": 2.7126349016679587e-06, + "loss": 0.6999, + "step": 855 + }, + { + "epoch": 1.4955436720142603, + "grad_norm": 2.1307895183563232, + "learning_rate": 2.6933173872027096e-06, + "loss": 0.6343, + "step": 856 + }, + { + "epoch": 1.4973262032085561, + "grad_norm": 2.0337185859680176, + "learning_rate": 2.674058191745038e-06, + "loss": 0.6076, + "step": 857 + }, + { + "epoch": 1.499108734402852, + "grad_norm": 2.2143378257751465, + "learning_rate": 2.654857469014113e-06, + "loss": 0.6181, + "step": 858 + }, + { + "epoch": 1.500891265597148, + "grad_norm": 2.016242265701294, + "learning_rate": 2.6357153722623976e-06, + "loss": 0.66, + "step": 859 + }, + { + "epoch": 1.5026737967914439, + "grad_norm": 2.317138433456421, + "learning_rate": 2.6166320542744184e-06, + "loss": 0.6371, + "step": 860 + }, + { + "epoch": 1.5044563279857397, + "grad_norm": 1.8911103010177612, + "learning_rate": 2.5976076673655628e-06, + "loss": 0.6335, + "step": 861 + }, + { + "epoch": 1.5062388591800357, + "grad_norm": 2.6606857776641846, + "learning_rate": 2.5786423633808487e-06, + "loss": 0.6414, + "step": 862 + }, + { + "epoch": 1.5080213903743316, + "grad_norm": 1.818349003791809, + "learning_rate": 2.559736293693721e-06, + "loss": 0.6156, + "step": 863 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 1.7764534950256348, + "learning_rate": 2.5408896092048384e-06, + "loss": 0.6441, + "step": 864 + }, + { + "epoch": 1.5115864527629235, + "grad_norm": 2.2821059226989746, + "learning_rate": 2.522102460340874e-06, + "loss": 0.6253, + "step": 865 + }, + { + "epoch": 1.5133689839572193, + "grad_norm": 2.0185694694519043, + "learning_rate": 2.5033749970533015e-06, + "loss": 0.6091, + "step": 866 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 2.3071510791778564, + "learning_rate": 2.484707368817221e-06, + "loss": 0.6268, + "step": 867 + }, + { + "epoch": 1.5169340463458112, + "grad_norm": 2.0063817501068115, + "learning_rate": 2.4660997246301445e-06, + "loss": 0.6439, + "step": 868 + }, + { + "epoch": 1.5187165775401068, + "grad_norm": 2.095769166946411, + "learning_rate": 2.447552213010821e-06, + "loss": 0.6397, + "step": 869 + }, + { + "epoch": 1.5204991087344029, + "grad_norm": 2.276026487350464, + "learning_rate": 2.4290649819980404e-06, + "loss": 0.6755, + "step": 870 + }, + { + "epoch": 1.522281639928699, + "grad_norm": 2.0926215648651123, + "learning_rate": 2.410638179149465e-06, + "loss": 0.581, + "step": 871 + }, + { + "epoch": 1.5240641711229945, + "grad_norm": 1.8585364818572998, + "learning_rate": 2.3922719515404303e-06, + "loss": 0.6108, + "step": 872 + }, + { + "epoch": 1.5258467023172906, + "grad_norm": 2.319448709487915, + "learning_rate": 2.3739664457627974e-06, + "loss": 0.6521, + "step": 873 + }, + { + "epoch": 1.5276292335115864, + "grad_norm": 2.0976061820983887, + "learning_rate": 2.3557218079237608e-06, + "loss": 0.6235, + "step": 874 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 2.241476058959961, + "learning_rate": 2.337538183644702e-06, + "loss": 0.612, + "step": 875 + }, + { + "epoch": 1.5311942959001783, + "grad_norm": 2.2935774326324463, + "learning_rate": 2.319415718060011e-06, + "loss": 0.6827, + "step": 876 + }, + { + "epoch": 1.5329768270944741, + "grad_norm": 2.440673589706421, + "learning_rate": 2.301354555815928e-06, + "loss": 0.6346, + "step": 877 + }, + { + "epoch": 1.53475935828877, + "grad_norm": 2.0865209102630615, + "learning_rate": 2.283354841069403e-06, + "loss": 0.6396, + "step": 878 + }, + { + "epoch": 1.536541889483066, + "grad_norm": 2.1622796058654785, + "learning_rate": 2.2654167174869325e-06, + "loss": 0.6198, + "step": 879 + }, + { + "epoch": 1.5383244206773619, + "grad_norm": 2.0409114360809326, + "learning_rate": 2.2475403282434193e-06, + "loss": 0.6023, + "step": 880 + }, + { + "epoch": 1.5401069518716577, + "grad_norm": 2.097445249557495, + "learning_rate": 2.2297258160210255e-06, + "loss": 0.6092, + "step": 881 + }, + { + "epoch": 1.5418894830659537, + "grad_norm": 2.039428234100342, + "learning_rate": 2.211973323008041e-06, + "loss": 0.6371, + "step": 882 + }, + { + "epoch": 1.5436720142602496, + "grad_norm": 1.863448977470398, + "learning_rate": 2.1942829908977315e-06, + "loss": 0.6145, + "step": 883 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 1.8281656503677368, + "learning_rate": 2.176654960887231e-06, + "loss": 0.6095, + "step": 884 + }, + { + "epoch": 1.5472370766488415, + "grad_norm": 1.9864037036895752, + "learning_rate": 2.1590893736763974e-06, + "loss": 0.6077, + "step": 885 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 1.857020616531372, + "learning_rate": 2.1415863694666973e-06, + "loss": 0.602, + "step": 886 + }, + { + "epoch": 1.5508021390374331, + "grad_norm": 1.8649786710739136, + "learning_rate": 2.124146087960085e-06, + "loss": 0.6169, + "step": 887 + }, + { + "epoch": 1.5525846702317292, + "grad_norm": 1.9944473505020142, + "learning_rate": 2.106768668357888e-06, + "loss": 0.5788, + "step": 888 + }, + { + "epoch": 1.5543672014260248, + "grad_norm": 2.0956411361694336, + "learning_rate": 2.089454249359689e-06, + "loss": 0.6517, + "step": 889 + }, + { + "epoch": 1.5561497326203209, + "grad_norm": 2.172884702682495, + "learning_rate": 2.072202969162234e-06, + "loss": 0.6386, + "step": 890 + }, + { + "epoch": 1.557932263814617, + "grad_norm": 2.040494203567505, + "learning_rate": 2.055014965458314e-06, + "loss": 0.6114, + "step": 891 + }, + { + "epoch": 1.5597147950089125, + "grad_norm": 1.7873563766479492, + "learning_rate": 2.0378903754356772e-06, + "loss": 0.6609, + "step": 892 + }, + { + "epoch": 1.5614973262032086, + "grad_norm": 2.231397867202759, + "learning_rate": 2.0208293357759266e-06, + "loss": 0.6615, + "step": 893 + }, + { + "epoch": 1.5632798573975044, + "grad_norm": 1.6660290956497192, + "learning_rate": 2.0038319826534312e-06, + "loss": 0.5974, + "step": 894 + }, + { + "epoch": 1.5650623885918002, + "grad_norm": 2.1246089935302734, + "learning_rate": 1.986898451734243e-06, + "loss": 0.6234, + "step": 895 + }, + { + "epoch": 1.5668449197860963, + "grad_norm": 2.0004465579986572, + "learning_rate": 1.9700288781750043e-06, + "loss": 0.5632, + "step": 896 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 2.172769546508789, + "learning_rate": 1.953223396621884e-06, + "loss": 0.6343, + "step": 897 + }, + { + "epoch": 1.570409982174688, + "grad_norm": 2.3006234169006348, + "learning_rate": 1.936482141209486e-06, + "loss": 0.603, + "step": 898 + }, + { + "epoch": 1.572192513368984, + "grad_norm": 1.8977664709091187, + "learning_rate": 1.919805245559796e-06, + "loss": 0.6998, + "step": 899 + }, + { + "epoch": 1.5739750445632799, + "grad_norm": 1.7701358795166016, + "learning_rate": 1.903192842781094e-06, + "loss": 0.6292, + "step": 900 + }, + { + "epoch": 1.5757575757575757, + "grad_norm": 2.157860040664673, + "learning_rate": 1.8866450654669133e-06, + "loss": 0.6096, + "step": 901 + }, + { + "epoch": 1.5775401069518717, + "grad_norm": 1.5891040563583374, + "learning_rate": 1.870162045694971e-06, + "loss": 0.5585, + "step": 902 + }, + { + "epoch": 1.5793226381461676, + "grad_norm": 2.1544394493103027, + "learning_rate": 1.8537439150261126e-06, + "loss": 0.6436, + "step": 903 + }, + { + "epoch": 1.5811051693404634, + "grad_norm": 1.7956115007400513, + "learning_rate": 1.8373908045032685e-06, + "loss": 0.5886, + "step": 904 + }, + { + "epoch": 1.5828877005347595, + "grad_norm": 2.07564115524292, + "learning_rate": 1.821102844650403e-06, + "loss": 0.577, + "step": 905 + }, + { + "epoch": 1.5846702317290553, + "grad_norm": 2.2092342376708984, + "learning_rate": 1.8048801654714687e-06, + "loss": 0.6499, + "step": 906 + }, + { + "epoch": 1.5864527629233511, + "grad_norm": 2.127253532409668, + "learning_rate": 1.788722896449383e-06, + "loss": 0.5997, + "step": 907 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 2.1936185359954834, + "learning_rate": 1.772631166544979e-06, + "loss": 0.63, + "step": 908 + }, + { + "epoch": 1.5900178253119428, + "grad_norm": 1.8700515031814575, + "learning_rate": 1.7566051041959864e-06, + "loss": 0.6522, + "step": 909 + }, + { + "epoch": 1.5918003565062389, + "grad_norm": 2.1035735607147217, + "learning_rate": 1.7406448373160024e-06, + "loss": 0.6252, + "step": 910 + }, + { + "epoch": 1.593582887700535, + "grad_norm": 1.6864609718322754, + "learning_rate": 1.7247504932934688e-06, + "loss": 0.6087, + "step": 911 + }, + { + "epoch": 1.5953654188948305, + "grad_norm": 2.179560661315918, + "learning_rate": 1.7089221989906634e-06, + "loss": 0.6516, + "step": 912 + }, + { + "epoch": 1.5971479500891266, + "grad_norm": 1.7755348682403564, + "learning_rate": 1.693160080742673e-06, + "loss": 0.5788, + "step": 913 + }, + { + "epoch": 1.5989304812834224, + "grad_norm": 1.8449007272720337, + "learning_rate": 1.6774642643563955e-06, + "loss": 0.5863, + "step": 914 + }, + { + "epoch": 1.6007130124777182, + "grad_norm": 2.1267855167388916, + "learning_rate": 1.6618348751095448e-06, + "loss": 0.5896, + "step": 915 + }, + { + "epoch": 1.6024955436720143, + "grad_norm": 1.7636414766311646, + "learning_rate": 1.64627203774963e-06, + "loss": 0.6364, + "step": 916 + }, + { + "epoch": 1.6042780748663101, + "grad_norm": 1.9327129125595093, + "learning_rate": 1.630775876492967e-06, + "loss": 0.5962, + "step": 917 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 2.0896499156951904, + "learning_rate": 1.615346515023698e-06, + "loss": 0.6945, + "step": 918 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 1.82899808883667, + "learning_rate": 1.599984076492791e-06, + "loss": 0.5981, + "step": 919 + }, + { + "epoch": 1.6096256684491979, + "grad_norm": 1.8291279077529907, + "learning_rate": 1.5846886835170649e-06, + "loss": 0.6005, + "step": 920 + }, + { + "epoch": 1.6114081996434937, + "grad_norm": 2.2317283153533936, + "learning_rate": 1.5694604581782059e-06, + "loss": 0.6218, + "step": 921 + }, + { + "epoch": 1.6131907308377897, + "grad_norm": 2.006565809249878, + "learning_rate": 1.5542995220217961e-06, + "loss": 0.6008, + "step": 922 + }, + { + "epoch": 1.6149732620320856, + "grad_norm": 2.1319220066070557, + "learning_rate": 1.5392059960563444e-06, + "loss": 0.6309, + "step": 923 + }, + { + "epoch": 1.6167557932263814, + "grad_norm": 1.8702588081359863, + "learning_rate": 1.5241800007523123e-06, + "loss": 0.6081, + "step": 924 + }, + { + "epoch": 1.6185383244206775, + "grad_norm": 2.236720561981201, + "learning_rate": 1.5092216560411631e-06, + "loss": 0.6133, + "step": 925 + }, + { + "epoch": 1.6203208556149733, + "grad_norm": 2.221240282058716, + "learning_rate": 1.4943310813144006e-06, + "loss": 0.6624, + "step": 926 + }, + { + "epoch": 1.6221033868092691, + "grad_norm": 1.9202772378921509, + "learning_rate": 1.4795083954226153e-06, + "loss": 0.6069, + "step": 927 + }, + { + "epoch": 1.6238859180035652, + "grad_norm": 2.2788867950439453, + "learning_rate": 1.464753716674535e-06, + "loss": 0.6444, + "step": 928 + }, + { + "epoch": 1.6256684491978608, + "grad_norm": 1.9460335969924927, + "learning_rate": 1.4500671628360863e-06, + "loss": 0.541, + "step": 929 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 2.113252878189087, + "learning_rate": 1.4354488511294418e-06, + "loss": 0.6688, + "step": 930 + }, + { + "epoch": 1.629233511586453, + "grad_norm": 1.8469072580337524, + "learning_rate": 1.420898898232098e-06, + "loss": 0.6, + "step": 931 + }, + { + "epoch": 1.6310160427807485, + "grad_norm": 1.9188716411590576, + "learning_rate": 1.4064174202759405e-06, + "loss": 0.638, + "step": 932 + }, + { + "epoch": 1.6327985739750446, + "grad_norm": 2.0176281929016113, + "learning_rate": 1.3920045328463116e-06, + "loss": 0.5726, + "step": 933 + }, + { + "epoch": 1.6345811051693404, + "grad_norm": 2.013188362121582, + "learning_rate": 1.3776603509810938e-06, + "loss": 0.5888, + "step": 934 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 1.9507560729980469, + "learning_rate": 1.3633849891697882e-06, + "loss": 0.6518, + "step": 935 + }, + { + "epoch": 1.6381461675579323, + "grad_norm": 1.9589416980743408, + "learning_rate": 1.349178561352603e-06, + "loss": 0.604, + "step": 936 + }, + { + "epoch": 1.6399286987522281, + "grad_norm": 1.811722993850708, + "learning_rate": 1.335041180919543e-06, + "loss": 0.662, + "step": 937 + }, + { + "epoch": 1.641711229946524, + "grad_norm": 2.11613392829895, + "learning_rate": 1.3209729607095022e-06, + "loss": 0.6149, + "step": 938 + }, + { + "epoch": 1.64349376114082, + "grad_norm": 1.6558808088302612, + "learning_rate": 1.3069740130093678e-06, + "loss": 0.6301, + "step": 939 + }, + { + "epoch": 1.6452762923351159, + "grad_norm": 2.1352925300598145, + "learning_rate": 1.2930444495531237e-06, + "loss": 0.637, + "step": 940 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 2.76387357711792, + "learning_rate": 1.2791843815209494e-06, + "loss": 0.6461, + "step": 941 + }, + { + "epoch": 1.6488413547237077, + "grad_norm": 2.2259490489959717, + "learning_rate": 1.2653939195383448e-06, + "loss": 0.6129, + "step": 942 + }, + { + "epoch": 1.6506238859180036, + "grad_norm": 2.03933048248291, + "learning_rate": 1.251673173675243e-06, + "loss": 0.6154, + "step": 943 + }, + { + "epoch": 1.6524064171122994, + "grad_norm": 2.0698153972625732, + "learning_rate": 1.2380222534451302e-06, + "loss": 0.6206, + "step": 944 + }, + { + "epoch": 1.6541889483065955, + "grad_norm": 1.9460407495498657, + "learning_rate": 1.2244412678041707e-06, + "loss": 0.6721, + "step": 945 + }, + { + "epoch": 1.6559714795008913, + "grad_norm": 2.335196018218994, + "learning_rate": 1.2109303251503434e-06, + "loss": 0.6158, + "step": 946 + }, + { + "epoch": 1.6577540106951871, + "grad_norm": 2.0459136962890625, + "learning_rate": 1.1974895333225646e-06, + "loss": 0.6597, + "step": 947 + }, + { + "epoch": 1.6595365418894832, + "grad_norm": 2.3721280097961426, + "learning_rate": 1.1841189995998404e-06, + "loss": 0.6127, + "step": 948 + }, + { + "epoch": 1.661319073083779, + "grad_norm": 2.00299334526062, + "learning_rate": 1.1708188307004043e-06, + "loss": 0.564, + "step": 949 + }, + { + "epoch": 1.6631016042780749, + "grad_norm": 2.1417782306671143, + "learning_rate": 1.1575891327808664e-06, + "loss": 0.5406, + "step": 950 + }, + { + "epoch": 1.664884135472371, + "grad_norm": 2.224438428878784, + "learning_rate": 1.1444300114353647e-06, + "loss": 0.5907, + "step": 951 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.8061896562576294, + "learning_rate": 1.131341571694724e-06, + "loss": 0.6806, + "step": 952 + }, + { + "epoch": 1.6684491978609626, + "grad_norm": 1.9865063428878784, + "learning_rate": 1.1183239180256133e-06, + "loss": 0.5527, + "step": 953 + }, + { + "epoch": 1.6702317290552586, + "grad_norm": 1.6837379932403564, + "learning_rate": 1.1053771543297198e-06, + "loss": 0.6019, + "step": 954 + }, + { + "epoch": 1.6720142602495542, + "grad_norm": 2.2152931690216064, + "learning_rate": 1.0925013839429166e-06, + "loss": 0.6155, + "step": 955 + }, + { + "epoch": 1.6737967914438503, + "grad_norm": 1.8260977268218994, + "learning_rate": 1.0796967096344324e-06, + "loss": 0.6033, + "step": 956 + }, + { + "epoch": 1.6755793226381461, + "grad_norm": 2.218449592590332, + "learning_rate": 1.066963233606041e-06, + "loss": 0.6262, + "step": 957 + }, + { + "epoch": 1.677361853832442, + "grad_norm": 1.9861358404159546, + "learning_rate": 1.0543010574912305e-06, + "loss": 0.6212, + "step": 958 + }, + { + "epoch": 1.679144385026738, + "grad_norm": 2.2905385494232178, + "learning_rate": 1.0417102823544112e-06, + "loss": 0.5466, + "step": 959 + }, + { + "epoch": 1.6809269162210339, + "grad_norm": 2.123490571975708, + "learning_rate": 1.0291910086900968e-06, + "loss": 0.6122, + "step": 960 + }, + { + "epoch": 1.6827094474153297, + "grad_norm": 2.042818546295166, + "learning_rate": 1.0167433364221024e-06, + "loss": 0.6353, + "step": 961 + }, + { + "epoch": 1.6844919786096257, + "grad_norm": 1.9634802341461182, + "learning_rate": 1.0043673649027519e-06, + "loss": 0.6569, + "step": 962 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 2.4289674758911133, + "learning_rate": 9.92063192912085e-07, + "loss": 0.6073, + "step": 963 + }, + { + "epoch": 1.6880570409982174, + "grad_norm": 1.9238197803497314, + "learning_rate": 9.798309186570588e-07, + "loss": 0.6272, + "step": 964 + }, + { + "epoch": 1.6898395721925135, + "grad_norm": 2.2965667247772217, + "learning_rate": 9.676706397707769e-07, + "loss": 0.594, + "step": 965 + }, + { + "epoch": 1.6916221033868093, + "grad_norm": 2.0149905681610107, + "learning_rate": 9.555824533117064e-07, + "loss": 0.6251, + "step": 966 + }, + { + "epoch": 1.6934046345811051, + "grad_norm": 1.921858549118042, + "learning_rate": 9.435664557628976e-07, + "loss": 0.6106, + "step": 967 + }, + { + "epoch": 1.6951871657754012, + "grad_norm": 1.854486107826233, + "learning_rate": 9.316227430312196e-07, + "loss": 0.6605, + "step": 968 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 2.0480058193206787, + "learning_rate": 9.197514104465955e-07, + "loss": 0.655, + "step": 969 + }, + { + "epoch": 1.6987522281639929, + "grad_norm": 2.107775926589966, + "learning_rate": 9.079525527612321e-07, + "loss": 0.5763, + "step": 970 + }, + { + "epoch": 1.700534759358289, + "grad_norm": 1.6507351398468018, + "learning_rate": 8.962262641488772e-07, + "loss": 0.5638, + "step": 971 + }, + { + "epoch": 1.7023172905525845, + "grad_norm": 1.974533200263977, + "learning_rate": 8.845726382040598e-07, + "loss": 0.6469, + "step": 972 + }, + { + "epoch": 1.7040998217468806, + "grad_norm": 2.504983901977539, + "learning_rate": 8.729917679413435e-07, + "loss": 0.5874, + "step": 973 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 2.3748109340667725, + "learning_rate": 8.614837457945868e-07, + "loss": 0.6213, + "step": 974 + }, + { + "epoch": 1.7076648841354722, + "grad_norm": 2.0784034729003906, + "learning_rate": 8.500486636162031e-07, + "loss": 0.6713, + "step": 975 + }, + { + "epoch": 1.7094474153297683, + "grad_norm": 2.2157113552093506, + "learning_rate": 8.38686612676427e-07, + "loss": 0.5642, + "step": 976 + }, + { + "epoch": 1.7112299465240641, + "grad_norm": 1.57169771194458, + "learning_rate": 8.273976836625897e-07, + "loss": 0.6263, + "step": 977 + }, + { + "epoch": 1.71301247771836, + "grad_norm": 2.015939474105835, + "learning_rate": 8.161819666783888e-07, + "loss": 0.5912, + "step": 978 + }, + { + "epoch": 1.714795008912656, + "grad_norm": 2.3326618671417236, + "learning_rate": 8.050395512431775e-07, + "loss": 0.5949, + "step": 979 + }, + { + "epoch": 1.7165775401069518, + "grad_norm": 2.1025454998016357, + "learning_rate": 7.939705262912423e-07, + "loss": 0.5597, + "step": 980 + }, + { + "epoch": 1.7183600713012477, + "grad_norm": 2.205465793609619, + "learning_rate": 7.829749801710961e-07, + "loss": 0.5117, + "step": 981 + }, + { + "epoch": 1.7201426024955437, + "grad_norm": 1.9855906963348389, + "learning_rate": 7.720530006447735e-07, + "loss": 0.6084, + "step": 982 + }, + { + "epoch": 1.7219251336898396, + "grad_norm": 2.0389535427093506, + "learning_rate": 7.612046748871327e-07, + "loss": 0.5774, + "step": 983 + }, + { + "epoch": 1.7237076648841354, + "grad_norm": 2.122084617614746, + "learning_rate": 7.504300894851546e-07, + "loss": 0.5661, + "step": 984 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 1.9116697311401367, + "learning_rate": 7.397293304372544e-07, + "loss": 0.5986, + "step": 985 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 1.990723967552185, + "learning_rate": 7.291024831525961e-07, + "loss": 0.5841, + "step": 986 + }, + { + "epoch": 1.7290552584670231, + "grad_norm": 2.1296963691711426, + "learning_rate": 7.185496324504093e-07, + "loss": 0.6501, + "step": 987 + }, + { + "epoch": 1.7290552584670231, + "eval_loss": 0.910054087638855, + "eval_runtime": 329.8702, + "eval_samples_per_second": 8.919, + "eval_steps_per_second": 1.116, + "step": 987 + }, + { + "epoch": 1.7308377896613192, + "grad_norm": 1.690833330154419, + "learning_rate": 7.080708625593103e-07, + "loss": 0.581, + "step": 988 + }, + { + "epoch": 1.732620320855615, + "grad_norm": 2.084317922592163, + "learning_rate": 6.976662571166348e-07, + "loss": 0.6071, + "step": 989 + }, + { + "epoch": 1.7344028520499108, + "grad_norm": 2.110623598098755, + "learning_rate": 6.87335899167767e-07, + "loss": 0.594, + "step": 990 + }, + { + "epoch": 1.736185383244207, + "grad_norm": 1.95883047580719, + "learning_rate": 6.77079871165478e-07, + "loss": 0.5884, + "step": 991 + }, + { + "epoch": 1.7379679144385025, + "grad_norm": 2.2620203495025635, + "learning_rate": 6.668982549692648e-07, + "loss": 0.569, + "step": 992 + }, + { + "epoch": 1.7397504456327986, + "grad_norm": 1.84707772731781, + "learning_rate": 6.567911318447018e-07, + "loss": 0.5643, + "step": 993 + }, + { + "epoch": 1.7415329768270946, + "grad_norm": 2.352470874786377, + "learning_rate": 6.467585824627886e-07, + "loss": 0.6341, + "step": 994 + }, + { + "epoch": 1.7433155080213902, + "grad_norm": 1.957634449005127, + "learning_rate": 6.368006868993071e-07, + "loss": 0.5805, + "step": 995 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 1.8500100374221802, + "learning_rate": 6.269175246341818e-07, + "loss": 0.6277, + "step": 996 + }, + { + "epoch": 1.7468805704099821, + "grad_norm": 2.1289892196655273, + "learning_rate": 6.171091745508484e-07, + "loss": 0.5556, + "step": 997 + }, + { + "epoch": 1.748663101604278, + "grad_norm": 2.2112717628479004, + "learning_rate": 6.073757149356185e-07, + "loss": 0.6379, + "step": 998 + }, + { + "epoch": 1.750445632798574, + "grad_norm": 1.6776251792907715, + "learning_rate": 5.977172234770589e-07, + "loss": 0.6103, + "step": 999 + }, + { + "epoch": 1.7522281639928698, + "grad_norm": 1.8559027910232544, + "learning_rate": 5.881337772653728e-07, + "loss": 0.6486, + "step": 1000 + }, + { + "epoch": 1.7540106951871657, + "grad_norm": 1.9796710014343262, + "learning_rate": 5.786254527917801e-07, + "loss": 0.5668, + "step": 1001 + }, + { + "epoch": 1.7557932263814617, + "grad_norm": 2.122915029525757, + "learning_rate": 5.691923259479093e-07, + "loss": 0.5901, + "step": 1002 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 1.7617076635360718, + "learning_rate": 5.598344720251935e-07, + "loss": 0.6103, + "step": 1003 + }, + { + "epoch": 1.7593582887700534, + "grad_norm": 2.4237117767333984, + "learning_rate": 5.505519657142677e-07, + "loss": 0.6668, + "step": 1004 + }, + { + "epoch": 1.7611408199643495, + "grad_norm": 2.0379695892333984, + "learning_rate": 5.413448811043676e-07, + "loss": 0.5855, + "step": 1005 + }, + { + "epoch": 1.7629233511586453, + "grad_norm": 2.066530704498291, + "learning_rate": 5.322132916827483e-07, + "loss": 0.606, + "step": 1006 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 2.2446866035461426, + "learning_rate": 5.231572703340904e-07, + "loss": 0.6629, + "step": 1007 + }, + { + "epoch": 1.7664884135472372, + "grad_norm": 2.5455820560455322, + "learning_rate": 5.141768893399224e-07, + "loss": 0.6387, + "step": 1008 + }, + { + "epoch": 1.768270944741533, + "grad_norm": 2.1626667976379395, + "learning_rate": 5.052722203780391e-07, + "loss": 0.6181, + "step": 1009 + }, + { + "epoch": 1.7700534759358288, + "grad_norm": 2.0961239337921143, + "learning_rate": 4.964433345219354e-07, + "loss": 0.6174, + "step": 1010 + }, + { + "epoch": 1.771836007130125, + "grad_norm": 2.0906381607055664, + "learning_rate": 4.876903022402324e-07, + "loss": 0.5807, + "step": 1011 + }, + { + "epoch": 1.7736185383244205, + "grad_norm": 1.672275185585022, + "learning_rate": 4.790131933961207e-07, + "loss": 0.5946, + "step": 1012 + }, + { + "epoch": 1.7754010695187166, + "grad_norm": 2.0058467388153076, + "learning_rate": 4.7041207724679904e-07, + "loss": 0.6311, + "step": 1013 + }, + { + "epoch": 1.7771836007130126, + "grad_norm": 1.8891334533691406, + "learning_rate": 4.6188702244292614e-07, + "loss": 0.6957, + "step": 1014 + }, + { + "epoch": 1.7789661319073082, + "grad_norm": 1.8195239305496216, + "learning_rate": 4.5343809702806606e-07, + "loss": 0.5564, + "step": 1015 + }, + { + "epoch": 1.7807486631016043, + "grad_norm": 1.9872279167175293, + "learning_rate": 4.4506536843815006e-07, + "loss": 0.6647, + "step": 1016 + }, + { + "epoch": 1.7825311942959001, + "grad_norm": 2.025951623916626, + "learning_rate": 4.367689035009359e-07, + "loss": 0.5382, + "step": 1017 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 2.5888593196868896, + "learning_rate": 4.285487684354772e-07, + "loss": 0.6339, + "step": 1018 + }, + { + "epoch": 1.786096256684492, + "grad_norm": 1.9204456806182861, + "learning_rate": 4.2040502885159264e-07, + "loss": 0.6232, + "step": 1019 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 2.0572903156280518, + "learning_rate": 4.123377497493419e-07, + "loss": 0.5806, + "step": 1020 + }, + { + "epoch": 1.7896613190730837, + "grad_norm": 1.7260277271270752, + "learning_rate": 4.0434699551850973e-07, + "loss": 0.6254, + "step": 1021 + }, + { + "epoch": 1.7914438502673797, + "grad_norm": 1.9684414863586426, + "learning_rate": 3.96432829938086e-07, + "loss": 0.5893, + "step": 1022 + }, + { + "epoch": 1.7932263814616756, + "grad_norm": 2.3725926876068115, + "learning_rate": 3.88595316175765e-07, + "loss": 0.5836, + "step": 1023 + }, + { + "epoch": 1.7950089126559714, + "grad_norm": 1.6349846124649048, + "learning_rate": 3.808345167874361e-07, + "loss": 0.5887, + "step": 1024 + }, + { + "epoch": 1.7967914438502675, + "grad_norm": 2.258037567138672, + "learning_rate": 3.731504937166841e-07, + "loss": 0.609, + "step": 1025 + }, + { + "epoch": 1.7985739750445633, + "grad_norm": 2.426041603088379, + "learning_rate": 3.6554330829429716e-07, + "loss": 0.6012, + "step": 1026 + }, + { + "epoch": 1.8003565062388591, + "grad_norm": 2.0654661655426025, + "learning_rate": 3.5801302123777524e-07, + "loss": 0.6293, + "step": 1027 + }, + { + "epoch": 1.8021390374331552, + "grad_norm": 1.8019964694976807, + "learning_rate": 3.50559692650847e-07, + "loss": 0.6013, + "step": 1028 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 2.0039703845977783, + "learning_rate": 3.4318338202298796e-07, + "loss": 0.6604, + "step": 1029 + }, + { + "epoch": 1.8057040998217468, + "grad_norm": 2.3911831378936768, + "learning_rate": 3.3588414822895097e-07, + "loss": 0.6017, + "step": 1030 + }, + { + "epoch": 1.807486631016043, + "grad_norm": 1.8550302982330322, + "learning_rate": 3.2866204952828773e-07, + "loss": 0.5959, + "step": 1031 + }, + { + "epoch": 1.8092691622103387, + "grad_norm": 2.110670328140259, + "learning_rate": 3.2151714356489225e-07, + "loss": 0.6788, + "step": 1032 + }, + { + "epoch": 1.8110516934046346, + "grad_norm": 2.0615427494049072, + "learning_rate": 3.1444948736653604e-07, + "loss": 0.6304, + "step": 1033 + }, + { + "epoch": 1.8128342245989306, + "grad_norm": 2.303654432296753, + "learning_rate": 3.0745913734441357e-07, + "loss": 0.5751, + "step": 1034 + }, + { + "epoch": 1.8146167557932262, + "grad_norm": 2.325732946395874, + "learning_rate": 3.005461492926931e-07, + "loss": 0.6593, + "step": 1035 + }, + { + "epoch": 1.8163992869875223, + "grad_norm": 2.306138515472412, + "learning_rate": 2.937105783880689e-07, + "loss": 0.6617, + "step": 1036 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.7654507160186768, + "learning_rate": 2.869524791893252e-07, + "loss": 0.6126, + "step": 1037 + }, + { + "epoch": 1.819964349376114, + "grad_norm": 1.7648675441741943, + "learning_rate": 2.8027190563689745e-07, + "loss": 0.6459, + "step": 1038 + }, + { + "epoch": 1.82174688057041, + "grad_norm": 2.3114426136016846, + "learning_rate": 2.736689110524404e-07, + "loss": 0.5672, + "step": 1039 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 2.0545334815979004, + "learning_rate": 2.671435481384066e-07, + "loss": 0.5506, + "step": 1040 + }, + { + "epoch": 1.8253119429590017, + "grad_norm": 1.8540985584259033, + "learning_rate": 2.60695868977624e-07, + "loss": 0.6197, + "step": 1041 + }, + { + "epoch": 1.8270944741532977, + "grad_norm": 1.8356308937072754, + "learning_rate": 2.5432592503288e-07, + "loss": 0.5953, + "step": 1042 + }, + { + "epoch": 1.8288770053475936, + "grad_norm": 2.0690879821777344, + "learning_rate": 2.480337671465083e-07, + "loss": 0.5921, + "step": 1043 + }, + { + "epoch": 1.8306595365418894, + "grad_norm": 1.937947392463684, + "learning_rate": 2.4181944553999026e-07, + "loss": 0.6009, + "step": 1044 + }, + { + "epoch": 1.8324420677361855, + "grad_norm": 2.166762351989746, + "learning_rate": 2.356830098135443e-07, + "loss": 0.6163, + "step": 1045 + }, + { + "epoch": 1.8342245989304813, + "grad_norm": 1.7820336818695068, + "learning_rate": 2.2962450894573606e-07, + "loss": 0.6497, + "step": 1046 + }, + { + "epoch": 1.8360071301247771, + "grad_norm": 1.9238687753677368, + "learning_rate": 2.236439912930899e-07, + "loss": 0.6168, + "step": 1047 + }, + { + "epoch": 1.8377896613190732, + "grad_norm": 2.0929696559906006, + "learning_rate": 2.1774150458969578e-07, + "loss": 0.5113, + "step": 1048 + }, + { + "epoch": 1.839572192513369, + "grad_norm": 2.0725159645080566, + "learning_rate": 2.1191709594683419e-07, + "loss": 0.528, + "step": 1049 + }, + { + "epoch": 1.8413547237076648, + "grad_norm": 2.1145312786102295, + "learning_rate": 2.0617081185259512e-07, + "loss": 0.588, + "step": 1050 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 1.9781309366226196, + "learning_rate": 2.0050269817151413e-07, + "loss": 0.5931, + "step": 1051 + }, + { + "epoch": 1.8449197860962567, + "grad_norm": 2.162130832672119, + "learning_rate": 1.9491280014419689e-07, + "loss": 0.5989, + "step": 1052 + }, + { + "epoch": 1.8467023172905526, + "grad_norm": 1.7542837858200073, + "learning_rate": 1.8940116238696514e-07, + "loss": 0.6411, + "step": 1053 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 2.309709310531616, + "learning_rate": 1.8396782889150144e-07, + "loss": 0.6104, + "step": 1054 + }, + { + "epoch": 1.8502673796791442, + "grad_norm": 1.9894696474075317, + "learning_rate": 1.7861284302449267e-07, + "loss": 0.6426, + "step": 1055 + }, + { + "epoch": 1.8520499108734403, + "grad_norm": 2.1708126068115234, + "learning_rate": 1.7333624752728373e-07, + "loss": 0.5921, + "step": 1056 + }, + { + "epoch": 1.8538324420677363, + "grad_norm": 2.056997060775757, + "learning_rate": 1.6813808451554447e-07, + "loss": 0.6295, + "step": 1057 + }, + { + "epoch": 1.855614973262032, + "grad_norm": 2.0771663188934326, + "learning_rate": 1.630183954789233e-07, + "loss": 0.5634, + "step": 1058 + }, + { + "epoch": 1.857397504456328, + "grad_norm": 1.957069993019104, + "learning_rate": 1.5797722128072514e-07, + "loss": 0.6253, + "step": 1059 + }, + { + "epoch": 1.8591800356506238, + "grad_norm": 2.001437187194824, + "learning_rate": 1.5301460215757625e-07, + "loss": 0.6211, + "step": 1060 + }, + { + "epoch": 1.8609625668449197, + "grad_norm": 2.0400633811950684, + "learning_rate": 1.481305777191133e-07, + "loss": 0.5816, + "step": 1061 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 1.9419509172439575, + "learning_rate": 1.4332518694765708e-07, + "loss": 0.6496, + "step": 1062 + }, + { + "epoch": 1.8645276292335116, + "grad_norm": 2.0088133811950684, + "learning_rate": 1.385984681979069e-07, + "loss": 0.6479, + "step": 1063 + }, + { + "epoch": 1.8663101604278074, + "grad_norm": 2.402581214904785, + "learning_rate": 1.3395045919663674e-07, + "loss": 0.6767, + "step": 1064 + }, + { + "epoch": 1.8680926916221035, + "grad_norm": 2.071725845336914, + "learning_rate": 1.293811970423864e-07, + "loss": 0.5917, + "step": 1065 + }, + { + "epoch": 1.8698752228163993, + "grad_norm": 1.871656060218811, + "learning_rate": 1.2489071820517394e-07, + "loss": 0.5651, + "step": 1066 + }, + { + "epoch": 1.8716577540106951, + "grad_norm": 2.2825636863708496, + "learning_rate": 1.2047905852619834e-07, + "loss": 0.6595, + "step": 1067 + }, + { + "epoch": 1.8734402852049912, + "grad_norm": 2.1393330097198486, + "learning_rate": 1.1614625321755613e-07, + "loss": 0.6219, + "step": 1068 + }, + { + "epoch": 1.875222816399287, + "grad_norm": 2.1348400115966797, + "learning_rate": 1.1189233686195956e-07, + "loss": 0.5991, + "step": 1069 + }, + { + "epoch": 1.8770053475935828, + "grad_norm": 1.983951210975647, + "learning_rate": 1.0771734341246121e-07, + "loss": 0.565, + "step": 1070 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 1.8638139963150024, + "learning_rate": 1.0362130619218424e-07, + "loss": 0.6242, + "step": 1071 + }, + { + "epoch": 1.8805704099821747, + "grad_norm": 2.2923965454101562, + "learning_rate": 9.96042578940526e-08, + "loss": 0.5726, + "step": 1072 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 2.1189980506896973, + "learning_rate": 9.566623058053337e-08, + "loss": 0.5842, + "step": 1073 + }, + { + "epoch": 1.8841354723707666, + "grad_norm": 1.616305947303772, + "learning_rate": 9.180725568338045e-08, + "loss": 0.6361, + "step": 1074 + }, + { + "epoch": 1.8859180035650622, + "grad_norm": 1.9860106706619263, + "learning_rate": 8.802736400338019e-08, + "loss": 0.6659, + "step": 1075 + }, + { + "epoch": 1.8877005347593583, + "grad_norm": 2.3532490730285645, + "learning_rate": 8.432658571011387e-08, + "loss": 0.6402, + "step": 1076 + }, + { + "epoch": 1.8894830659536543, + "grad_norm": 2.0499911308288574, + "learning_rate": 8.070495034170566e-08, + "loss": 0.6302, + "step": 1077 + }, + { + "epoch": 1.89126559714795, + "grad_norm": 2.055406332015991, + "learning_rate": 7.716248680459726e-08, + "loss": 0.5792, + "step": 1078 + }, + { + "epoch": 1.893048128342246, + "grad_norm": 1.9370893239974976, + "learning_rate": 7.369922337330914e-08, + "loss": 0.6076, + "step": 1079 + }, + { + "epoch": 1.8948306595365418, + "grad_norm": 2.0450425148010254, + "learning_rate": 7.031518769021972e-08, + "loss": 0.6294, + "step": 1080 + }, + { + "epoch": 1.8966131907308377, + "grad_norm": 1.9761147499084473, + "learning_rate": 6.701040676534432e-08, + "loss": 0.6191, + "step": 1081 + }, + { + "epoch": 1.8983957219251337, + "grad_norm": 2.1664581298828125, + "learning_rate": 6.378490697611761e-08, + "loss": 0.5854, + "step": 1082 + }, + { + "epoch": 1.9001782531194296, + "grad_norm": 2.0368752479553223, + "learning_rate": 6.063871406718381e-08, + "loss": 0.5508, + "step": 1083 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 2.349900007247925, + "learning_rate": 5.75718531501912e-08, + "loss": 0.6304, + "step": 1084 + }, + { + "epoch": 1.9037433155080214, + "grad_norm": 2.0724315643310547, + "learning_rate": 5.4584348703594634e-08, + "loss": 0.6062, + "step": 1085 + }, + { + "epoch": 1.9055258467023173, + "grad_norm": 2.279808282852173, + "learning_rate": 5.1676224572452246e-08, + "loss": 0.6588, + "step": 1086 + }, + { + "epoch": 1.9073083778966131, + "grad_norm": 1.9012436866760254, + "learning_rate": 4.884750396824567e-08, + "loss": 0.6473, + "step": 1087 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 1.8943966627120972, + "learning_rate": 4.609820946868682e-08, + "loss": 0.595, + "step": 1088 + }, + { + "epoch": 1.910873440285205, + "grad_norm": 2.4004600048065186, + "learning_rate": 4.3428363017540276e-08, + "loss": 0.6061, + "step": 1089 + }, + { + "epoch": 1.9126559714795008, + "grad_norm": 2.035853624343872, + "learning_rate": 4.083798592444899e-08, + "loss": 0.5977, + "step": 1090 + }, + { + "epoch": 1.914438502673797, + "grad_norm": 1.916986107826233, + "learning_rate": 3.832709886476438e-08, + "loss": 0.5385, + "step": 1091 + }, + { + "epoch": 1.9162210338680927, + "grad_norm": 1.8782941102981567, + "learning_rate": 3.589572187937651e-08, + "loss": 0.5656, + "step": 1092 + }, + { + "epoch": 1.9180035650623886, + "grad_norm": 2.144084930419922, + "learning_rate": 3.354387437456197e-08, + "loss": 0.5861, + "step": 1093 + }, + { + "epoch": 1.9197860962566846, + "grad_norm": 2.3000340461730957, + "learning_rate": 3.127157512182288e-08, + "loss": 0.5849, + "step": 1094 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 2.2352511882781982, + "learning_rate": 2.907884225774149e-08, + "loss": 0.6817, + "step": 1095 + }, + { + "epoch": 1.9233511586452763, + "grad_norm": 1.7509145736694336, + "learning_rate": 2.6965693283829143e-08, + "loss": 0.5748, + "step": 1096 + }, + { + "epoch": 1.9251336898395723, + "grad_norm": 2.180359363555908, + "learning_rate": 2.4932145066394186e-08, + "loss": 0.593, + "step": 1097 + }, + { + "epoch": 1.926916221033868, + "grad_norm": 2.239539623260498, + "learning_rate": 2.2978213836400974e-08, + "loss": 0.5085, + "step": 1098 + }, + { + "epoch": 1.928698752228164, + "grad_norm": 2.2124013900756836, + "learning_rate": 2.1103915189344403e-08, + "loss": 0.5729, + "step": 1099 + }, + { + "epoch": 1.93048128342246, + "grad_norm": 2.1283504962921143, + "learning_rate": 1.9309264085124458e-08, + "loss": 0.6068, + "step": 1100 + }, + { + "epoch": 1.9322638146167557, + "grad_norm": 2.4210431575775146, + "learning_rate": 1.7594274847926306e-08, + "loss": 0.6128, + "step": 1101 + }, + { + "epoch": 1.9340463458110517, + "grad_norm": 2.339913845062256, + "learning_rate": 1.5958961166104847e-08, + "loss": 0.5487, + "step": 1102 + }, + { + "epoch": 1.9358288770053476, + "grad_norm": 1.8890469074249268, + "learning_rate": 1.4403336092077002e-08, + "loss": 0.6371, + "step": 1103 + }, + { + "epoch": 1.9376114081996434, + "grad_norm": 2.159550666809082, + "learning_rate": 1.2927412042218479e-08, + "loss": 0.589, + "step": 1104 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 2.154710292816162, + "learning_rate": 1.1531200796762731e-08, + "loss": 0.607, + "step": 1105 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 2.176748275756836, + "learning_rate": 1.0214713499706596e-08, + "loss": 0.6424, + "step": 1106 + }, + { + "epoch": 1.9429590017825311, + "grad_norm": 2.3147072792053223, + "learning_rate": 8.977960658723694e-09, + "loss": 0.674, + "step": 1107 + }, + { + "epoch": 1.9447415329768272, + "grad_norm": 2.0270538330078125, + "learning_rate": 7.820952145078942e-09, + "loss": 0.6114, + "step": 1108 + }, + { + "epoch": 1.946524064171123, + "grad_norm": 1.8115181922912598, + "learning_rate": 6.743697193549725e-09, + "loss": 0.6498, + "step": 1109 + }, + { + "epoch": 1.9483065953654188, + "grad_norm": 1.941896677017212, + "learning_rate": 5.7462044023515186e-09, + "loss": 0.6068, + "step": 1110 + }, + { + "epoch": 1.950089126559715, + "grad_norm": 2.0113778114318848, + "learning_rate": 4.828481733073487e-09, + "loss": 0.5983, + "step": 1111 + }, + { + "epoch": 1.9518716577540107, + "grad_norm": 2.019991159439087, + "learning_rate": 3.9905365106085445e-09, + "loss": 0.5958, + "step": 1112 + }, + { + "epoch": 1.9536541889483066, + "grad_norm": 2.064058303833008, + "learning_rate": 3.2323754230989546e-09, + "loss": 0.6223, + "step": 1113 + }, + { + "epoch": 1.9554367201426026, + "grad_norm": 2.038113594055176, + "learning_rate": 2.5540045218819256e-09, + "loss": 0.6118, + "step": 1114 + }, + { + "epoch": 1.9572192513368984, + "grad_norm": 2.5111892223358154, + "learning_rate": 1.9554292214418734e-09, + "loss": 0.5805, + "step": 1115 + }, + { + "epoch": 1.9590017825311943, + "grad_norm": 2.1744213104248047, + "learning_rate": 1.436654299367124e-09, + "loss": 0.6322, + "step": 1116 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 1.8781476020812988, + "learning_rate": 9.976838963099421e-10, + "loss": 0.596, + "step": 1117 + }, + { + "epoch": 1.962566844919786, + "grad_norm": 2.229332447052002, + "learning_rate": 6.385215159565583e-10, + "loss": 0.6091, + "step": 1118 + }, + { + "epoch": 1.964349376114082, + "grad_norm": 2.3374135494232178, + "learning_rate": 3.5917002499719076e-10, + "loss": 0.5855, + "step": 1119 + }, + { + "epoch": 1.966131907308378, + "grad_norm": 1.931396722793579, + "learning_rate": 1.59631653102732e-10, + "loss": 0.6366, + "step": 1120 + }, + { + "epoch": 1.9679144385026737, + "grad_norm": 1.9510762691497803, + "learning_rate": 3.990799290809477e-11, + "loss": 0.638, + "step": 1121 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 2.2144649028778076, + "learning_rate": 0.0, + "loss": 0.554, + "step": 1122 + } + ], + "logging_steps": 1, + "max_steps": 1122, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 281, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.428572157668229e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}