diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,23748 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9993349589891376, + "eval_steps": 564, + "global_step": 3381, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008867213478164486, + "grad_norm": 105.35820770263672, + "learning_rate": 1.0000000000000001e-07, + "loss": 3.2849, + "step": 1 + }, + { + "epoch": 0.0008867213478164486, + "eval_loss": 3.391964912414551, + "eval_runtime": 71.6314, + "eval_samples_per_second": 2.68, + "eval_steps_per_second": 0.67, + "step": 1 + }, + { + "epoch": 0.0017734426956328973, + "grad_norm": 97.53443908691406, + "learning_rate": 2.0000000000000002e-07, + "loss": 3.1563, + "step": 2 + }, + { + "epoch": 0.002660164043449346, + "grad_norm": 101.90808868408203, + "learning_rate": 3.0000000000000004e-07, + "loss": 3.2752, + "step": 3 + }, + { + "epoch": 0.0035468853912657946, + "grad_norm": 99.9984359741211, + "learning_rate": 4.0000000000000003e-07, + "loss": 3.2379, + "step": 4 + }, + { + "epoch": 0.004433606739082243, + "grad_norm": 93.44331359863281, + "learning_rate": 5.000000000000001e-07, + "loss": 3.0293, + "step": 5 + }, + { + "epoch": 0.005320328086898692, + "grad_norm": 98.4569091796875, + "learning_rate": 6.000000000000001e-07, + "loss": 3.2555, + "step": 6 + }, + { + "epoch": 0.006207049434715141, + "grad_norm": 96.81407165527344, + "learning_rate": 7.000000000000001e-07, + "loss": 3.2231, + "step": 7 + }, + { + "epoch": 0.007093770782531589, + "grad_norm": 81.63873291015625, + "learning_rate": 8.000000000000001e-07, + "loss": 2.8762, + "step": 8 + }, + { + "epoch": 0.007980492130348038, + "grad_norm": 94.48062896728516, + "learning_rate": 9.000000000000001e-07, + "loss": 3.1835, + "step": 9 + }, + { + "epoch": 0.008867213478164486, + "grad_norm": 82.22791290283203, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.9473, + "step": 10 + }, + { + "epoch": 0.009753934825980935, + "grad_norm": 66.42625427246094, + "learning_rate": 1.1e-06, + "loss": 2.819, + "step": 11 + }, + { + "epoch": 0.010640656173797385, + "grad_norm": 63.793827056884766, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.8275, + "step": 12 + }, + { + "epoch": 0.011527377521613832, + "grad_norm": 55.935733795166016, + "learning_rate": 1.3e-06, + "loss": 2.6193, + "step": 13 + }, + { + "epoch": 0.012414098869430281, + "grad_norm": 37.10558319091797, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.2145, + "step": 14 + }, + { + "epoch": 0.01330082021724673, + "grad_norm": 31.75094985961914, + "learning_rate": 1.5e-06, + "loss": 2.0544, + "step": 15 + }, + { + "epoch": 0.014187541565063178, + "grad_norm": 33.343467712402344, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.0847, + "step": 16 + }, + { + "epoch": 0.015074262912879628, + "grad_norm": 30.80988311767578, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.9482, + "step": 17 + }, + { + "epoch": 0.015960984260696077, + "grad_norm": 29.666048049926758, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.8699, + "step": 18 + }, + { + "epoch": 0.016847705608512526, + "grad_norm": 29.294734954833984, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.4459, + "step": 19 + }, + { + "epoch": 0.017734426956328972, + "grad_norm": 31.5433292388916, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.2445, + "step": 20 + }, + { + "epoch": 0.01862114830414542, + "grad_norm": 33.008087158203125, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.2146, + "step": 21 + }, + { + "epoch": 0.01950786965196187, + "grad_norm": 30.054840087890625, + "learning_rate": 2.2e-06, + "loss": 1.0768, + "step": 22 + }, + { + "epoch": 0.02039459099977832, + "grad_norm": 34.325687408447266, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.0071, + "step": 23 + }, + { + "epoch": 0.02128131234759477, + "grad_norm": 37.760826110839844, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.7746, + "step": 24 + }, + { + "epoch": 0.02216803369541122, + "grad_norm": 32.44926834106445, + "learning_rate": 2.5e-06, + "loss": 0.583, + "step": 25 + }, + { + "epoch": 0.023054755043227664, + "grad_norm": 43.93800735473633, + "learning_rate": 2.6e-06, + "loss": 0.311, + "step": 26 + }, + { + "epoch": 0.023941476391044113, + "grad_norm": 34.61675262451172, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.2481, + "step": 27 + }, + { + "epoch": 0.024828197738860563, + "grad_norm": 16.67363166809082, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.1261, + "step": 28 + }, + { + "epoch": 0.025714919086677012, + "grad_norm": 16.36240005493164, + "learning_rate": 2.9e-06, + "loss": 0.1044, + "step": 29 + }, + { + "epoch": 0.02660164043449346, + "grad_norm": 13.118659019470215, + "learning_rate": 3e-06, + "loss": 0.1006, + "step": 30 + }, + { + "epoch": 0.02748836178230991, + "grad_norm": 15.39649486541748, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.1069, + "step": 31 + }, + { + "epoch": 0.028375083130126356, + "grad_norm": 14.789741516113281, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.113, + "step": 32 + }, + { + "epoch": 0.029261804477942806, + "grad_norm": 8.225541114807129, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.0753, + "step": 33 + }, + { + "epoch": 0.030148525825759255, + "grad_norm": 5.805243968963623, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0788, + "step": 34 + }, + { + "epoch": 0.031035247173575704, + "grad_norm": 9.23648738861084, + "learning_rate": 3.5e-06, + "loss": 0.0662, + "step": 35 + }, + { + "epoch": 0.031921968521392154, + "grad_norm": 3.07475209236145, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0452, + "step": 36 + }, + { + "epoch": 0.0328086898692086, + "grad_norm": 18.152185440063477, + "learning_rate": 3.7e-06, + "loss": 0.0722, + "step": 37 + }, + { + "epoch": 0.03369541121702505, + "grad_norm": 8.058207511901855, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0592, + "step": 38 + }, + { + "epoch": 0.0345821325648415, + "grad_norm": 6.9130353927612305, + "learning_rate": 3.900000000000001e-06, + "loss": 0.0616, + "step": 39 + }, + { + "epoch": 0.035468853912657944, + "grad_norm": 6.1149516105651855, + "learning_rate": 4.000000000000001e-06, + "loss": 0.0545, + "step": 40 + }, + { + "epoch": 0.03635557526047439, + "grad_norm": 2.7555227279663086, + "learning_rate": 4.1e-06, + "loss": 0.05, + "step": 41 + }, + { + "epoch": 0.03724229660829084, + "grad_norm": 7.199471950531006, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0536, + "step": 42 + }, + { + "epoch": 0.03812901795610729, + "grad_norm": 5.784890174865723, + "learning_rate": 4.3e-06, + "loss": 0.0534, + "step": 43 + }, + { + "epoch": 0.03901573930392374, + "grad_norm": 2.663280487060547, + "learning_rate": 4.4e-06, + "loss": 0.0459, + "step": 44 + }, + { + "epoch": 0.03990246065174019, + "grad_norm": 2.6677751541137695, + "learning_rate": 4.5e-06, + "loss": 0.0427, + "step": 45 + }, + { + "epoch": 0.04078918199955664, + "grad_norm": 6.349153518676758, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0711, + "step": 46 + }, + { + "epoch": 0.04167590334737309, + "grad_norm": 2.082395076751709, + "learning_rate": 4.7e-06, + "loss": 0.0546, + "step": 47 + }, + { + "epoch": 0.04256262469518954, + "grad_norm": 3.0053043365478516, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0461, + "step": 48 + }, + { + "epoch": 0.04344934604300599, + "grad_norm": 4.920153617858887, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.0537, + "step": 49 + }, + { + "epoch": 0.04433606739082244, + "grad_norm": 1.2779656648635864, + "learning_rate": 5e-06, + "loss": 0.0419, + "step": 50 + }, + { + "epoch": 0.045222788738638886, + "grad_norm": 2.678823471069336, + "learning_rate": 5.1e-06, + "loss": 0.0385, + "step": 51 + }, + { + "epoch": 0.04610951008645533, + "grad_norm": 2.920421600341797, + "learning_rate": 5.2e-06, + "loss": 0.0518, + "step": 52 + }, + { + "epoch": 0.04699623143427178, + "grad_norm": 2.3334977626800537, + "learning_rate": 5.300000000000001e-06, + "loss": 0.0397, + "step": 53 + }, + { + "epoch": 0.04788295278208823, + "grad_norm": 2.648272752761841, + "learning_rate": 5.400000000000001e-06, + "loss": 0.0374, + "step": 54 + }, + { + "epoch": 0.048769674129904676, + "grad_norm": 6.664809703826904, + "learning_rate": 5.500000000000001e-06, + "loss": 0.052, + "step": 55 + }, + { + "epoch": 0.049656395477721126, + "grad_norm": 2.4327173233032227, + "learning_rate": 5.600000000000001e-06, + "loss": 0.0272, + "step": 56 + }, + { + "epoch": 0.050543116825537575, + "grad_norm": 1.9386110305786133, + "learning_rate": 5.7e-06, + "loss": 0.0285, + "step": 57 + }, + { + "epoch": 0.051429838173354024, + "grad_norm": 2.0530142784118652, + "learning_rate": 5.8e-06, + "loss": 0.0353, + "step": 58 + }, + { + "epoch": 0.05231655952117047, + "grad_norm": 1.730095624923706, + "learning_rate": 5.9e-06, + "loss": 0.0393, + "step": 59 + }, + { + "epoch": 0.05320328086898692, + "grad_norm": 1.3259291648864746, + "learning_rate": 6e-06, + "loss": 0.0245, + "step": 60 + }, + { + "epoch": 0.05409000221680337, + "grad_norm": 2.2212586402893066, + "learning_rate": 6.1e-06, + "loss": 0.038, + "step": 61 + }, + { + "epoch": 0.05497672356461982, + "grad_norm": 2.2513837814331055, + "learning_rate": 6.200000000000001e-06, + "loss": 0.0351, + "step": 62 + }, + { + "epoch": 0.055863444912436264, + "grad_norm": 1.917279601097107, + "learning_rate": 6.300000000000001e-06, + "loss": 0.0277, + "step": 63 + }, + { + "epoch": 0.05675016626025271, + "grad_norm": 4.1914544105529785, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.0375, + "step": 64 + }, + { + "epoch": 0.05763688760806916, + "grad_norm": 2.5768065452575684, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.0366, + "step": 65 + }, + { + "epoch": 0.05852360895588561, + "grad_norm": 2.6516637802124023, + "learning_rate": 6.600000000000001e-06, + "loss": 0.0259, + "step": 66 + }, + { + "epoch": 0.05941033030370206, + "grad_norm": 6.144593238830566, + "learning_rate": 6.700000000000001e-06, + "loss": 0.0416, + "step": 67 + }, + { + "epoch": 0.06029705165151851, + "grad_norm": 2.34024977684021, + "learning_rate": 6.800000000000001e-06, + "loss": 0.0236, + "step": 68 + }, + { + "epoch": 0.06118377299933496, + "grad_norm": 1.2529782056808472, + "learning_rate": 6.9e-06, + "loss": 0.0205, + "step": 69 + }, + { + "epoch": 0.06207049434715141, + "grad_norm": 2.8280766010284424, + "learning_rate": 7e-06, + "loss": 0.024, + "step": 70 + }, + { + "epoch": 0.06295721569496786, + "grad_norm": 2.5235207080841064, + "learning_rate": 7.100000000000001e-06, + "loss": 0.046, + "step": 71 + }, + { + "epoch": 0.06384393704278431, + "grad_norm": 2.7281038761138916, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.036, + "step": 72 + }, + { + "epoch": 0.06473065839060076, + "grad_norm": 1.456838846206665, + "learning_rate": 7.3e-06, + "loss": 0.0224, + "step": 73 + }, + { + "epoch": 0.0656173797384172, + "grad_norm": 2.595977306365967, + "learning_rate": 7.4e-06, + "loss": 0.0377, + "step": 74 + }, + { + "epoch": 0.06650410108623366, + "grad_norm": 1.190896987915039, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0244, + "step": 75 + }, + { + "epoch": 0.0673908224340501, + "grad_norm": 3.9588253498077393, + "learning_rate": 7.600000000000001e-06, + "loss": 0.0471, + "step": 76 + }, + { + "epoch": 0.06827754378186655, + "grad_norm": 1.1399868726730347, + "learning_rate": 7.7e-06, + "loss": 0.0167, + "step": 77 + }, + { + "epoch": 0.069164265129683, + "grad_norm": 2.5425806045532227, + "learning_rate": 7.800000000000002e-06, + "loss": 0.0395, + "step": 78 + }, + { + "epoch": 0.07005098647749945, + "grad_norm": 1.4164979457855225, + "learning_rate": 7.9e-06, + "loss": 0.015, + "step": 79 + }, + { + "epoch": 0.07093770782531589, + "grad_norm": 2.7378995418548584, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0232, + "step": 80 + }, + { + "epoch": 0.07182442917313234, + "grad_norm": 1.0994404554367065, + "learning_rate": 8.1e-06, + "loss": 0.014, + "step": 81 + }, + { + "epoch": 0.07271115052094879, + "grad_norm": 1.4709571599960327, + "learning_rate": 8.2e-06, + "loss": 0.0347, + "step": 82 + }, + { + "epoch": 0.07359787186876524, + "grad_norm": 1.0280840396881104, + "learning_rate": 8.3e-06, + "loss": 0.0181, + "step": 83 + }, + { + "epoch": 0.07448459321658168, + "grad_norm": 4.092020034790039, + "learning_rate": 8.400000000000001e-06, + "loss": 0.0395, + "step": 84 + }, + { + "epoch": 0.07537131456439813, + "grad_norm": 3.000915050506592, + "learning_rate": 8.5e-06, + "loss": 0.0356, + "step": 85 + }, + { + "epoch": 0.07625803591221458, + "grad_norm": 1.4135894775390625, + "learning_rate": 8.6e-06, + "loss": 0.0283, + "step": 86 + }, + { + "epoch": 0.07714475726003103, + "grad_norm": 2.2624311447143555, + "learning_rate": 8.700000000000001e-06, + "loss": 0.0329, + "step": 87 + }, + { + "epoch": 0.07803147860784748, + "grad_norm": 3.6346747875213623, + "learning_rate": 8.8e-06, + "loss": 0.0394, + "step": 88 + }, + { + "epoch": 0.07891819995566393, + "grad_norm": 6.760399341583252, + "learning_rate": 8.900000000000001e-06, + "loss": 0.0298, + "step": 89 + }, + { + "epoch": 0.07980492130348038, + "grad_norm": 2.7166779041290283, + "learning_rate": 9e-06, + "loss": 0.0364, + "step": 90 + }, + { + "epoch": 0.08069164265129683, + "grad_norm": 1.9583271741867065, + "learning_rate": 9.100000000000001e-06, + "loss": 0.0257, + "step": 91 + }, + { + "epoch": 0.08157836399911328, + "grad_norm": 0.892090916633606, + "learning_rate": 9.200000000000002e-06, + "loss": 0.0242, + "step": 92 + }, + { + "epoch": 0.08246508534692973, + "grad_norm": 2.415807008743286, + "learning_rate": 9.3e-06, + "loss": 0.022, + "step": 93 + }, + { + "epoch": 0.08335180669474618, + "grad_norm": 1.4595792293548584, + "learning_rate": 9.4e-06, + "loss": 0.0227, + "step": 94 + }, + { + "epoch": 0.08423852804256263, + "grad_norm": 2.336437702178955, + "learning_rate": 9.5e-06, + "loss": 0.0395, + "step": 95 + }, + { + "epoch": 0.08512524939037908, + "grad_norm": 3.5256662368774414, + "learning_rate": 9.600000000000001e-06, + "loss": 0.0396, + "step": 96 + }, + { + "epoch": 0.08601197073819553, + "grad_norm": 2.1209492683410645, + "learning_rate": 9.7e-06, + "loss": 0.0289, + "step": 97 + }, + { + "epoch": 0.08689869208601197, + "grad_norm": 1.9491901397705078, + "learning_rate": 9.800000000000001e-06, + "loss": 0.0299, + "step": 98 + }, + { + "epoch": 0.08778541343382842, + "grad_norm": 1.8206737041473389, + "learning_rate": 9.9e-06, + "loss": 0.0285, + "step": 99 + }, + { + "epoch": 0.08867213478164487, + "grad_norm": 3.291557788848877, + "learning_rate": 1e-05, + "loss": 0.0391, + "step": 100 + }, + { + "epoch": 0.08955885612946132, + "grad_norm": 2.652033805847168, + "learning_rate": 9.999997707933255e-06, + "loss": 0.0321, + "step": 101 + }, + { + "epoch": 0.09044557747727777, + "grad_norm": 0.7913987040519714, + "learning_rate": 9.999990831735122e-06, + "loss": 0.0211, + "step": 102 + }, + { + "epoch": 0.09133229882509421, + "grad_norm": 2.8626115322113037, + "learning_rate": 9.999979371411906e-06, + "loss": 0.0317, + "step": 103 + }, + { + "epoch": 0.09221902017291066, + "grad_norm": 1.934716820716858, + "learning_rate": 9.999963326974111e-06, + "loss": 0.0227, + "step": 104 + }, + { + "epoch": 0.0931057415207271, + "grad_norm": 1.4647551774978638, + "learning_rate": 9.999942698436452e-06, + "loss": 0.0253, + "step": 105 + }, + { + "epoch": 0.09399246286854356, + "grad_norm": 1.2313497066497803, + "learning_rate": 9.999917485817836e-06, + "loss": 0.0291, + "step": 106 + }, + { + "epoch": 0.09487918421636, + "grad_norm": 3.5174624919891357, + "learning_rate": 9.999887689141383e-06, + "loss": 0.0495, + "step": 107 + }, + { + "epoch": 0.09576590556417645, + "grad_norm": 3.038734197616577, + "learning_rate": 9.99985330843441e-06, + "loss": 0.0317, + "step": 108 + }, + { + "epoch": 0.0966526269119929, + "grad_norm": 1.4445629119873047, + "learning_rate": 9.999814343728439e-06, + "loss": 0.0235, + "step": 109 + }, + { + "epoch": 0.09753934825980935, + "grad_norm": 1.7566113471984863, + "learning_rate": 9.999770795059189e-06, + "loss": 0.0268, + "step": 110 + }, + { + "epoch": 0.0984260696076258, + "grad_norm": 2.2457563877105713, + "learning_rate": 9.999722662466594e-06, + "loss": 0.0287, + "step": 111 + }, + { + "epoch": 0.09931279095544225, + "grad_norm": 2.0211808681488037, + "learning_rate": 9.999669945994778e-06, + "loss": 0.0241, + "step": 112 + }, + { + "epoch": 0.1001995123032587, + "grad_norm": 1.2760939598083496, + "learning_rate": 9.999612645692077e-06, + "loss": 0.0212, + "step": 113 + }, + { + "epoch": 0.10108623365107515, + "grad_norm": 0.9905439019203186, + "learning_rate": 9.999550761611021e-06, + "loss": 0.0195, + "step": 114 + }, + { + "epoch": 0.1019729549988916, + "grad_norm": 1.5984809398651123, + "learning_rate": 9.999484293808351e-06, + "loss": 0.0175, + "step": 115 + }, + { + "epoch": 0.10285967634670805, + "grad_norm": 0.9621034860610962, + "learning_rate": 9.999413242345001e-06, + "loss": 0.0132, + "step": 116 + }, + { + "epoch": 0.1037463976945245, + "grad_norm": 1.1000981330871582, + "learning_rate": 9.99933760728612e-06, + "loss": 0.0158, + "step": 117 + }, + { + "epoch": 0.10463311904234095, + "grad_norm": 0.9676356911659241, + "learning_rate": 9.999257388701046e-06, + "loss": 0.0227, + "step": 118 + }, + { + "epoch": 0.1055198403901574, + "grad_norm": 2.129136323928833, + "learning_rate": 9.999172586663331e-06, + "loss": 0.028, + "step": 119 + }, + { + "epoch": 0.10640656173797385, + "grad_norm": 2.1270437240600586, + "learning_rate": 9.99908320125072e-06, + "loss": 0.0169, + "step": 120 + }, + { + "epoch": 0.1072932830857903, + "grad_norm": 2.0533697605133057, + "learning_rate": 9.998989232545164e-06, + "loss": 0.024, + "step": 121 + }, + { + "epoch": 0.10818000443360674, + "grad_norm": 2.021212577819824, + "learning_rate": 9.998890680632818e-06, + "loss": 0.0236, + "step": 122 + }, + { + "epoch": 0.1090667257814232, + "grad_norm": 3.157949686050415, + "learning_rate": 9.998787545604034e-06, + "loss": 0.0125, + "step": 123 + }, + { + "epoch": 0.10995344712923964, + "grad_norm": 1.5896356105804443, + "learning_rate": 9.998679827553372e-06, + "loss": 0.0222, + "step": 124 + }, + { + "epoch": 0.11084016847705609, + "grad_norm": 1.598581075668335, + "learning_rate": 9.99856752657959e-06, + "loss": 0.0271, + "step": 125 + }, + { + "epoch": 0.11172688982487253, + "grad_norm": 0.9781515598297119, + "learning_rate": 9.998450642785649e-06, + "loss": 0.0122, + "step": 126 + }, + { + "epoch": 0.11261361117268898, + "grad_norm": 1.5786858797073364, + "learning_rate": 9.99832917627871e-06, + "loss": 0.0219, + "step": 127 + }, + { + "epoch": 0.11350033252050543, + "grad_norm": 1.4831783771514893, + "learning_rate": 9.998203127170137e-06, + "loss": 0.0215, + "step": 128 + }, + { + "epoch": 0.11438705386832188, + "grad_norm": 2.904634952545166, + "learning_rate": 9.998072495575493e-06, + "loss": 0.0269, + "step": 129 + }, + { + "epoch": 0.11527377521613832, + "grad_norm": 1.4405618906021118, + "learning_rate": 9.997937281614549e-06, + "loss": 0.0281, + "step": 130 + }, + { + "epoch": 0.11616049656395477, + "grad_norm": 1.347447156906128, + "learning_rate": 9.99779748541127e-06, + "loss": 0.0225, + "step": 131 + }, + { + "epoch": 0.11704721791177122, + "grad_norm": 0.973578691482544, + "learning_rate": 9.997653107093825e-06, + "loss": 0.0232, + "step": 132 + }, + { + "epoch": 0.11793393925958767, + "grad_norm": 1.3047363758087158, + "learning_rate": 9.997504146794587e-06, + "loss": 0.0279, + "step": 133 + }, + { + "epoch": 0.11882066060740412, + "grad_norm": 1.9343928098678589, + "learning_rate": 9.997350604650123e-06, + "loss": 0.0334, + "step": 134 + }, + { + "epoch": 0.11970738195522057, + "grad_norm": 0.9066912531852722, + "learning_rate": 9.997192480801203e-06, + "loss": 0.0143, + "step": 135 + }, + { + "epoch": 0.12059410330303702, + "grad_norm": 1.209761381149292, + "learning_rate": 9.997029775392805e-06, + "loss": 0.0216, + "step": 136 + }, + { + "epoch": 0.12148082465085347, + "grad_norm": 2.670605182647705, + "learning_rate": 9.996862488574098e-06, + "loss": 0.0264, + "step": 137 + }, + { + "epoch": 0.12236754599866992, + "grad_norm": 1.5146721601486206, + "learning_rate": 9.996690620498454e-06, + "loss": 0.0262, + "step": 138 + }, + { + "epoch": 0.12325426734648637, + "grad_norm": 2.121723175048828, + "learning_rate": 9.996514171323449e-06, + "loss": 0.0345, + "step": 139 + }, + { + "epoch": 0.12414098869430282, + "grad_norm": 0.9533706307411194, + "learning_rate": 9.996333141210855e-06, + "loss": 0.0192, + "step": 140 + }, + { + "epoch": 0.12502771004211927, + "grad_norm": 0.7458922863006592, + "learning_rate": 9.996147530326645e-06, + "loss": 0.0172, + "step": 141 + }, + { + "epoch": 0.12591443138993572, + "grad_norm": 1.2241960763931274, + "learning_rate": 9.995957338840993e-06, + "loss": 0.0152, + "step": 142 + }, + { + "epoch": 0.12680115273775217, + "grad_norm": 1.9640858173370361, + "learning_rate": 9.99576256692827e-06, + "loss": 0.0206, + "step": 143 + }, + { + "epoch": 0.12768787408556861, + "grad_norm": 0.6544690728187561, + "learning_rate": 9.99556321476705e-06, + "loss": 0.0105, + "step": 144 + }, + { + "epoch": 0.12857459543338506, + "grad_norm": 0.9590530395507812, + "learning_rate": 9.995359282540105e-06, + "loss": 0.0212, + "step": 145 + }, + { + "epoch": 0.1294613167812015, + "grad_norm": 1.0109769105911255, + "learning_rate": 9.995150770434401e-06, + "loss": 0.0105, + "step": 146 + }, + { + "epoch": 0.13034803812901796, + "grad_norm": 1.1498576402664185, + "learning_rate": 9.994937678641113e-06, + "loss": 0.0255, + "step": 147 + }, + { + "epoch": 0.1312347594768344, + "grad_norm": 1.3174915313720703, + "learning_rate": 9.994720007355604e-06, + "loss": 0.0188, + "step": 148 + }, + { + "epoch": 0.13212148082465086, + "grad_norm": 2.1449038982391357, + "learning_rate": 9.994497756777445e-06, + "loss": 0.0254, + "step": 149 + }, + { + "epoch": 0.1330082021724673, + "grad_norm": 2.002474784851074, + "learning_rate": 9.994270927110402e-06, + "loss": 0.0178, + "step": 150 + }, + { + "epoch": 0.13389492352028376, + "grad_norm": 3.6749556064605713, + "learning_rate": 9.994039518562433e-06, + "loss": 0.0374, + "step": 151 + }, + { + "epoch": 0.1347816448681002, + "grad_norm": 0.846824049949646, + "learning_rate": 9.993803531345703e-06, + "loss": 0.0239, + "step": 152 + }, + { + "epoch": 0.13566836621591666, + "grad_norm": 1.4229305982589722, + "learning_rate": 9.993562965676572e-06, + "loss": 0.0242, + "step": 153 + }, + { + "epoch": 0.1365550875637331, + "grad_norm": 3.022684097290039, + "learning_rate": 9.993317821775597e-06, + "loss": 0.0362, + "step": 154 + }, + { + "epoch": 0.13744180891154956, + "grad_norm": 1.6719801425933838, + "learning_rate": 9.99306809986753e-06, + "loss": 0.0212, + "step": 155 + }, + { + "epoch": 0.138328530259366, + "grad_norm": 2.7858753204345703, + "learning_rate": 9.992813800181326e-06, + "loss": 0.0241, + "step": 156 + }, + { + "epoch": 0.13921525160718246, + "grad_norm": 0.45698562264442444, + "learning_rate": 9.992554922950133e-06, + "loss": 0.0133, + "step": 157 + }, + { + "epoch": 0.1401019729549989, + "grad_norm": 1.5275429487228394, + "learning_rate": 9.992291468411293e-06, + "loss": 0.0248, + "step": 158 + }, + { + "epoch": 0.14098869430281535, + "grad_norm": 2.2717480659484863, + "learning_rate": 9.992023436806353e-06, + "loss": 0.0272, + "step": 159 + }, + { + "epoch": 0.14187541565063178, + "grad_norm": 1.3149230480194092, + "learning_rate": 9.991750828381048e-06, + "loss": 0.0182, + "step": 160 + }, + { + "epoch": 0.14276213699844822, + "grad_norm": 1.2786978483200073, + "learning_rate": 9.991473643385317e-06, + "loss": 0.0187, + "step": 161 + }, + { + "epoch": 0.14364885834626467, + "grad_norm": 1.7859911918640137, + "learning_rate": 9.991191882073285e-06, + "loss": 0.0215, + "step": 162 + }, + { + "epoch": 0.14453557969408112, + "grad_norm": 1.611259937286377, + "learning_rate": 9.990905544703281e-06, + "loss": 0.0181, + "step": 163 + }, + { + "epoch": 0.14542230104189757, + "grad_norm": 0.6689061522483826, + "learning_rate": 9.990614631537827e-06, + "loss": 0.0124, + "step": 164 + }, + { + "epoch": 0.14630902238971402, + "grad_norm": 1.0409244298934937, + "learning_rate": 9.990319142843641e-06, + "loss": 0.0204, + "step": 165 + }, + { + "epoch": 0.14719574373753047, + "grad_norm": 0.7819463014602661, + "learning_rate": 9.990019078891633e-06, + "loss": 0.0223, + "step": 166 + }, + { + "epoch": 0.14808246508534692, + "grad_norm": 1.9662169218063354, + "learning_rate": 9.989714439956909e-06, + "loss": 0.0119, + "step": 167 + }, + { + "epoch": 0.14896918643316337, + "grad_norm": 1.353804111480713, + "learning_rate": 9.989405226318772e-06, + "loss": 0.0276, + "step": 168 + }, + { + "epoch": 0.14985590778097982, + "grad_norm": 0.8875593543052673, + "learning_rate": 9.989091438260718e-06, + "loss": 0.0224, + "step": 169 + }, + { + "epoch": 0.15074262912879627, + "grad_norm": 0.862114429473877, + "learning_rate": 9.988773076070433e-06, + "loss": 0.0159, + "step": 170 + }, + { + "epoch": 0.15162935047661272, + "grad_norm": 2.4255783557891846, + "learning_rate": 9.988450140039802e-06, + "loss": 0.0381, + "step": 171 + }, + { + "epoch": 0.15251607182442917, + "grad_norm": 0.9882796406745911, + "learning_rate": 9.988122630464902e-06, + "loss": 0.0182, + "step": 172 + }, + { + "epoch": 0.15340279317224562, + "grad_norm": 1.3533726930618286, + "learning_rate": 9.987790547646003e-06, + "loss": 0.0253, + "step": 173 + }, + { + "epoch": 0.15428951452006207, + "grad_norm": 0.563122034072876, + "learning_rate": 9.987453891887567e-06, + "loss": 0.0185, + "step": 174 + }, + { + "epoch": 0.15517623586787851, + "grad_norm": 0.98648601770401, + "learning_rate": 9.987112663498245e-06, + "loss": 0.0175, + "step": 175 + }, + { + "epoch": 0.15606295721569496, + "grad_norm": 3.2445409297943115, + "learning_rate": 9.986766862790888e-06, + "loss": 0.0206, + "step": 176 + }, + { + "epoch": 0.1569496785635114, + "grad_norm": 0.9300917387008667, + "learning_rate": 9.986416490082537e-06, + "loss": 0.0115, + "step": 177 + }, + { + "epoch": 0.15783639991132786, + "grad_norm": 0.5182569026947021, + "learning_rate": 9.98606154569442e-06, + "loss": 0.0107, + "step": 178 + }, + { + "epoch": 0.1587231212591443, + "grad_norm": 1.1416966915130615, + "learning_rate": 9.98570202995196e-06, + "loss": 0.0116, + "step": 179 + }, + { + "epoch": 0.15960984260696076, + "grad_norm": 0.6441078782081604, + "learning_rate": 9.98533794318477e-06, + "loss": 0.0147, + "step": 180 + }, + { + "epoch": 0.1604965639547772, + "grad_norm": 1.4156014919281006, + "learning_rate": 9.984969285726656e-06, + "loss": 0.0199, + "step": 181 + }, + { + "epoch": 0.16138328530259366, + "grad_norm": 2.0149669647216797, + "learning_rate": 9.984596057915613e-06, + "loss": 0.0323, + "step": 182 + }, + { + "epoch": 0.1622700066504101, + "grad_norm": 1.7955539226531982, + "learning_rate": 9.984218260093826e-06, + "loss": 0.0332, + "step": 183 + }, + { + "epoch": 0.16315672799822656, + "grad_norm": 1.028401494026184, + "learning_rate": 9.98383589260767e-06, + "loss": 0.0174, + "step": 184 + }, + { + "epoch": 0.164043449346043, + "grad_norm": 1.5321674346923828, + "learning_rate": 9.983448955807708e-06, + "loss": 0.028, + "step": 185 + }, + { + "epoch": 0.16493017069385946, + "grad_norm": 1.3364337682724, + "learning_rate": 9.983057450048697e-06, + "loss": 0.0178, + "step": 186 + }, + { + "epoch": 0.1658168920416759, + "grad_norm": 1.4727017879486084, + "learning_rate": 9.982661375689577e-06, + "loss": 0.0177, + "step": 187 + }, + { + "epoch": 0.16670361338949236, + "grad_norm": 0.46738162636756897, + "learning_rate": 9.982260733093482e-06, + "loss": 0.0085, + "step": 188 + }, + { + "epoch": 0.1675903347373088, + "grad_norm": 0.4024590849876404, + "learning_rate": 9.98185552262773e-06, + "loss": 0.0131, + "step": 189 + }, + { + "epoch": 0.16847705608512525, + "grad_norm": 0.5752727389335632, + "learning_rate": 9.98144574466383e-06, + "loss": 0.0111, + "step": 190 + }, + { + "epoch": 0.1693637774329417, + "grad_norm": 0.593545138835907, + "learning_rate": 9.981031399577479e-06, + "loss": 0.012, + "step": 191 + }, + { + "epoch": 0.17025049878075815, + "grad_norm": 0.9298077821731567, + "learning_rate": 9.980612487748556e-06, + "loss": 0.0143, + "step": 192 + }, + { + "epoch": 0.1711372201285746, + "grad_norm": 3.385441780090332, + "learning_rate": 9.980189009561131e-06, + "loss": 0.0192, + "step": 193 + }, + { + "epoch": 0.17202394147639105, + "grad_norm": 1.0534871816635132, + "learning_rate": 9.979760965403462e-06, + "loss": 0.018, + "step": 194 + }, + { + "epoch": 0.1729106628242075, + "grad_norm": 0.6916846036911011, + "learning_rate": 9.97932835566799e-06, + "loss": 0.0078, + "step": 195 + }, + { + "epoch": 0.17379738417202395, + "grad_norm": 0.8603850603103638, + "learning_rate": 9.978891180751346e-06, + "loss": 0.0101, + "step": 196 + }, + { + "epoch": 0.1746841055198404, + "grad_norm": 0.9636616110801697, + "learning_rate": 9.97844944105434e-06, + "loss": 0.0111, + "step": 197 + }, + { + "epoch": 0.17557082686765685, + "grad_norm": 1.4222532510757446, + "learning_rate": 9.978003136981972e-06, + "loss": 0.0158, + "step": 198 + }, + { + "epoch": 0.1764575482154733, + "grad_norm": 1.2078766822814941, + "learning_rate": 9.977552268943426e-06, + "loss": 0.0227, + "step": 199 + }, + { + "epoch": 0.17734426956328975, + "grad_norm": 0.7552307844161987, + "learning_rate": 9.97709683735207e-06, + "loss": 0.0125, + "step": 200 + }, + { + "epoch": 0.1782309909111062, + "grad_norm": 1.0908719301223755, + "learning_rate": 9.976636842625454e-06, + "loss": 0.0235, + "step": 201 + }, + { + "epoch": 0.17911771225892265, + "grad_norm": 1.2128921747207642, + "learning_rate": 9.976172285185315e-06, + "loss": 0.0128, + "step": 202 + }, + { + "epoch": 0.1800044336067391, + "grad_norm": 0.7293077707290649, + "learning_rate": 9.975703165457571e-06, + "loss": 0.0146, + "step": 203 + }, + { + "epoch": 0.18089115495455554, + "grad_norm": 0.995217502117157, + "learning_rate": 9.975229483872325e-06, + "loss": 0.0166, + "step": 204 + }, + { + "epoch": 0.181777876302372, + "grad_norm": 0.7026538252830505, + "learning_rate": 9.974751240863858e-06, + "loss": 0.02, + "step": 205 + }, + { + "epoch": 0.18266459765018841, + "grad_norm": 1.7263981103897095, + "learning_rate": 9.97426843687064e-06, + "loss": 0.0286, + "step": 206 + }, + { + "epoch": 0.18355131899800486, + "grad_norm": 1.4515742063522339, + "learning_rate": 9.973781072335315e-06, + "loss": 0.015, + "step": 207 + }, + { + "epoch": 0.1844380403458213, + "grad_norm": 0.743718683719635, + "learning_rate": 9.973289147704714e-06, + "loss": 0.0183, + "step": 208 + }, + { + "epoch": 0.18532476169363776, + "grad_norm": 0.5708590745925903, + "learning_rate": 9.972792663429847e-06, + "loss": 0.0107, + "step": 209 + }, + { + "epoch": 0.1862114830414542, + "grad_norm": 1.0680079460144043, + "learning_rate": 9.972291619965901e-06, + "loss": 0.0198, + "step": 210 + }, + { + "epoch": 0.18709820438927066, + "grad_norm": 1.7404032945632935, + "learning_rate": 9.971786017772249e-06, + "loss": 0.0187, + "step": 211 + }, + { + "epoch": 0.1879849257370871, + "grad_norm": 1.116582989692688, + "learning_rate": 9.971275857312438e-06, + "loss": 0.0124, + "step": 212 + }, + { + "epoch": 0.18887164708490356, + "grad_norm": 0.9854468107223511, + "learning_rate": 9.9707611390542e-06, + "loss": 0.0146, + "step": 213 + }, + { + "epoch": 0.18975836843272, + "grad_norm": 1.0946894884109497, + "learning_rate": 9.970241863469439e-06, + "loss": 0.0165, + "step": 214 + }, + { + "epoch": 0.19064508978053646, + "grad_norm": 0.6762576699256897, + "learning_rate": 9.969718031034243e-06, + "loss": 0.0083, + "step": 215 + }, + { + "epoch": 0.1915318111283529, + "grad_norm": 0.8591622114181519, + "learning_rate": 9.969189642228874e-06, + "loss": 0.02, + "step": 216 + }, + { + "epoch": 0.19241853247616936, + "grad_norm": 0.9623995423316956, + "learning_rate": 9.968656697537776e-06, + "loss": 0.0145, + "step": 217 + }, + { + "epoch": 0.1933052538239858, + "grad_norm": 1.2979151010513306, + "learning_rate": 9.968119197449564e-06, + "loss": 0.0143, + "step": 218 + }, + { + "epoch": 0.19419197517180226, + "grad_norm": 1.155766487121582, + "learning_rate": 9.967577142457031e-06, + "loss": 0.0167, + "step": 219 + }, + { + "epoch": 0.1950786965196187, + "grad_norm": 2.1985487937927246, + "learning_rate": 9.967030533057155e-06, + "loss": 0.0305, + "step": 220 + }, + { + "epoch": 0.19596541786743515, + "grad_norm": 0.6944717168807983, + "learning_rate": 9.966479369751072e-06, + "loss": 0.0157, + "step": 221 + }, + { + "epoch": 0.1968521392152516, + "grad_norm": 0.7124226093292236, + "learning_rate": 9.96592365304411e-06, + "loss": 0.0255, + "step": 222 + }, + { + "epoch": 0.19773886056306805, + "grad_norm": 1.8563802242279053, + "learning_rate": 9.965363383445762e-06, + "loss": 0.0189, + "step": 223 + }, + { + "epoch": 0.1986255819108845, + "grad_norm": 2.422700881958008, + "learning_rate": 9.9647985614697e-06, + "loss": 0.0249, + "step": 224 + }, + { + "epoch": 0.19951230325870095, + "grad_norm": 1.5919915437698364, + "learning_rate": 9.964229187633767e-06, + "loss": 0.0243, + "step": 225 + }, + { + "epoch": 0.2003990246065174, + "grad_norm": 0.6012934446334839, + "learning_rate": 9.963655262459978e-06, + "loss": 0.0228, + "step": 226 + }, + { + "epoch": 0.20128574595433385, + "grad_norm": 0.7975476384162903, + "learning_rate": 9.963076786474529e-06, + "loss": 0.0203, + "step": 227 + }, + { + "epoch": 0.2021724673021503, + "grad_norm": 0.9103793501853943, + "learning_rate": 9.962493760207775e-06, + "loss": 0.0199, + "step": 228 + }, + { + "epoch": 0.20305918864996675, + "grad_norm": 2.0518851280212402, + "learning_rate": 9.961906184194255e-06, + "loss": 0.0284, + "step": 229 + }, + { + "epoch": 0.2039459099977832, + "grad_norm": 0.6736752390861511, + "learning_rate": 9.961314058972672e-06, + "loss": 0.0115, + "step": 230 + }, + { + "epoch": 0.20483263134559965, + "grad_norm": 0.31807392835617065, + "learning_rate": 9.960717385085904e-06, + "loss": 0.0126, + "step": 231 + }, + { + "epoch": 0.2057193526934161, + "grad_norm": 1.0832537412643433, + "learning_rate": 9.960116163080995e-06, + "loss": 0.0179, + "step": 232 + }, + { + "epoch": 0.20660607404123255, + "grad_norm": 1.3503203392028809, + "learning_rate": 9.959510393509163e-06, + "loss": 0.0191, + "step": 233 + }, + { + "epoch": 0.207492795389049, + "grad_norm": 0.45156577229499817, + "learning_rate": 9.958900076925793e-06, + "loss": 0.0088, + "step": 234 + }, + { + "epoch": 0.20837951673686544, + "grad_norm": 0.5721056461334229, + "learning_rate": 9.958285213890442e-06, + "loss": 0.0159, + "step": 235 + }, + { + "epoch": 0.2092662380846819, + "grad_norm": 1.2899566888809204, + "learning_rate": 9.95766580496683e-06, + "loss": 0.0179, + "step": 236 + }, + { + "epoch": 0.21015295943249834, + "grad_norm": 0.5771740674972534, + "learning_rate": 9.957041850722848e-06, + "loss": 0.0077, + "step": 237 + }, + { + "epoch": 0.2110396807803148, + "grad_norm": 0.7620949745178223, + "learning_rate": 9.956413351730556e-06, + "loss": 0.0163, + "step": 238 + }, + { + "epoch": 0.21192640212813124, + "grad_norm": 1.0351495742797852, + "learning_rate": 9.955780308566174e-06, + "loss": 0.0178, + "step": 239 + }, + { + "epoch": 0.2128131234759477, + "grad_norm": 0.6831864714622498, + "learning_rate": 9.955142721810099e-06, + "loss": 0.0087, + "step": 240 + }, + { + "epoch": 0.21369984482376414, + "grad_norm": 0.6774961948394775, + "learning_rate": 9.954500592046883e-06, + "loss": 0.0123, + "step": 241 + }, + { + "epoch": 0.2145865661715806, + "grad_norm": 1.7236403226852417, + "learning_rate": 9.953853919865251e-06, + "loss": 0.0232, + "step": 242 + }, + { + "epoch": 0.21547328751939704, + "grad_norm": 1.017462134361267, + "learning_rate": 9.953202705858087e-06, + "loss": 0.0176, + "step": 243 + }, + { + "epoch": 0.2163600088672135, + "grad_norm": 0.37702977657318115, + "learning_rate": 9.95254695062244e-06, + "loss": 0.01, + "step": 244 + }, + { + "epoch": 0.21724673021502994, + "grad_norm": 1.5439709424972534, + "learning_rate": 9.951886654759528e-06, + "loss": 0.0193, + "step": 245 + }, + { + "epoch": 0.2181334515628464, + "grad_norm": 0.5491088628768921, + "learning_rate": 9.951221818874724e-06, + "loss": 0.0095, + "step": 246 + }, + { + "epoch": 0.21902017291066284, + "grad_norm": 0.6317766308784485, + "learning_rate": 9.950552443577571e-06, + "loss": 0.0164, + "step": 247 + }, + { + "epoch": 0.21990689425847929, + "grad_norm": 0.9085983037948608, + "learning_rate": 9.949878529481767e-06, + "loss": 0.0137, + "step": 248 + }, + { + "epoch": 0.22079361560629573, + "grad_norm": 0.583099365234375, + "learning_rate": 9.949200077205177e-06, + "loss": 0.012, + "step": 249 + }, + { + "epoch": 0.22168033695411218, + "grad_norm": 0.4570126533508301, + "learning_rate": 9.948517087369822e-06, + "loss": 0.008, + "step": 250 + }, + { + "epoch": 0.2225670583019286, + "grad_norm": 1.1167882680892944, + "learning_rate": 9.947829560601884e-06, + "loss": 0.0181, + "step": 251 + }, + { + "epoch": 0.22345377964974505, + "grad_norm": 0.6278082728385925, + "learning_rate": 9.947137497531711e-06, + "loss": 0.0154, + "step": 252 + }, + { + "epoch": 0.2243405009975615, + "grad_norm": 0.43254172801971436, + "learning_rate": 9.9464408987938e-06, + "loss": 0.0093, + "step": 253 + }, + { + "epoch": 0.22522722234537795, + "grad_norm": 0.45759662985801697, + "learning_rate": 9.945739765026814e-06, + "loss": 0.0105, + "step": 254 + }, + { + "epoch": 0.2261139436931944, + "grad_norm": 0.6438144445419312, + "learning_rate": 9.945034096873571e-06, + "loss": 0.0136, + "step": 255 + }, + { + "epoch": 0.22700066504101085, + "grad_norm": 1.2649673223495483, + "learning_rate": 9.944323894981045e-06, + "loss": 0.0223, + "step": 256 + }, + { + "epoch": 0.2278873863888273, + "grad_norm": 0.7128615975379944, + "learning_rate": 9.943609160000369e-06, + "loss": 0.0123, + "step": 257 + }, + { + "epoch": 0.22877410773664375, + "grad_norm": 1.0316693782806396, + "learning_rate": 9.94288989258683e-06, + "loss": 0.0163, + "step": 258 + }, + { + "epoch": 0.2296608290844602, + "grad_norm": 1.885494589805603, + "learning_rate": 9.942166093399875e-06, + "loss": 0.0174, + "step": 259 + }, + { + "epoch": 0.23054755043227665, + "grad_norm": 0.5127606391906738, + "learning_rate": 9.9414377631031e-06, + "loss": 0.0091, + "step": 260 + }, + { + "epoch": 0.2314342717800931, + "grad_norm": 0.6301166415214539, + "learning_rate": 9.940704902364254e-06, + "loss": 0.0128, + "step": 261 + }, + { + "epoch": 0.23232099312790955, + "grad_norm": 0.6273002624511719, + "learning_rate": 9.93996751185525e-06, + "loss": 0.0131, + "step": 262 + }, + { + "epoch": 0.233207714475726, + "grad_norm": 0.4122670888900757, + "learning_rate": 9.939225592252143e-06, + "loss": 0.006, + "step": 263 + }, + { + "epoch": 0.23409443582354245, + "grad_norm": 0.3675723075866699, + "learning_rate": 9.938479144235146e-06, + "loss": 0.0076, + "step": 264 + }, + { + "epoch": 0.2349811571713589, + "grad_norm": 0.5711640119552612, + "learning_rate": 9.937728168488622e-06, + "loss": 0.0123, + "step": 265 + }, + { + "epoch": 0.23586787851917534, + "grad_norm": 0.5491669774055481, + "learning_rate": 9.936972665701086e-06, + "loss": 0.0094, + "step": 266 + }, + { + "epoch": 0.2367545998669918, + "grad_norm": 0.25534459948539734, + "learning_rate": 9.936212636565205e-06, + "loss": 0.0032, + "step": 267 + }, + { + "epoch": 0.23764132121480824, + "grad_norm": 0.7696008682250977, + "learning_rate": 9.93544808177779e-06, + "loss": 0.0118, + "step": 268 + }, + { + "epoch": 0.2385280425626247, + "grad_norm": 0.8142785429954529, + "learning_rate": 9.934679002039809e-06, + "loss": 0.0202, + "step": 269 + }, + { + "epoch": 0.23941476391044114, + "grad_norm": 0.9304322004318237, + "learning_rate": 9.933905398056371e-06, + "loss": 0.0158, + "step": 270 + }, + { + "epoch": 0.2403014852582576, + "grad_norm": 0.6360547542572021, + "learning_rate": 9.93312727053674e-06, + "loss": 0.0091, + "step": 271 + }, + { + "epoch": 0.24118820660607404, + "grad_norm": 0.5449398756027222, + "learning_rate": 9.932344620194322e-06, + "loss": 0.0112, + "step": 272 + }, + { + "epoch": 0.2420749279538905, + "grad_norm": 0.4042613208293915, + "learning_rate": 9.931557447746675e-06, + "loss": 0.0056, + "step": 273 + }, + { + "epoch": 0.24296164930170694, + "grad_norm": 0.8487951755523682, + "learning_rate": 9.930765753915497e-06, + "loss": 0.012, + "step": 274 + }, + { + "epoch": 0.2438483706495234, + "grad_norm": 0.47606247663497925, + "learning_rate": 9.929969539426634e-06, + "loss": 0.0075, + "step": 275 + }, + { + "epoch": 0.24473509199733984, + "grad_norm": 0.7890058159828186, + "learning_rate": 9.929168805010078e-06, + "loss": 0.0142, + "step": 276 + }, + { + "epoch": 0.2456218133451563, + "grad_norm": 0.6880226135253906, + "learning_rate": 9.928363551399961e-06, + "loss": 0.0123, + "step": 277 + }, + { + "epoch": 0.24650853469297274, + "grad_norm": 0.7886592745780945, + "learning_rate": 9.927553779334565e-06, + "loss": 0.0122, + "step": 278 + }, + { + "epoch": 0.24739525604078919, + "grad_norm": 0.44544312357902527, + "learning_rate": 9.926739489556308e-06, + "loss": 0.006, + "step": 279 + }, + { + "epoch": 0.24828197738860563, + "grad_norm": 0.6304795742034912, + "learning_rate": 9.925920682811752e-06, + "loss": 0.0145, + "step": 280 + }, + { + "epoch": 0.24916869873642208, + "grad_norm": 0.47169551253318787, + "learning_rate": 9.925097359851603e-06, + "loss": 0.0063, + "step": 281 + }, + { + "epoch": 0.25005542008423853, + "grad_norm": 0.43053027987480164, + "learning_rate": 9.924269521430705e-06, + "loss": 0.005, + "step": 282 + }, + { + "epoch": 0.25094214143205495, + "grad_norm": 0.7776206731796265, + "learning_rate": 9.923437168308042e-06, + "loss": 0.0152, + "step": 283 + }, + { + "epoch": 0.25182886277987143, + "grad_norm": 1.4472633600234985, + "learning_rate": 9.922600301246736e-06, + "loss": 0.0307, + "step": 284 + }, + { + "epoch": 0.25271558412768785, + "grad_norm": 0.6039207577705383, + "learning_rate": 9.921758921014052e-06, + "loss": 0.0144, + "step": 285 + }, + { + "epoch": 0.25360230547550433, + "grad_norm": 0.8766511082649231, + "learning_rate": 9.92091302838139e-06, + "loss": 0.0138, + "step": 286 + }, + { + "epoch": 0.25448902682332075, + "grad_norm": 1.0067875385284424, + "learning_rate": 9.920062624124282e-06, + "loss": 0.0193, + "step": 287 + }, + { + "epoch": 0.25537574817113723, + "grad_norm": 0.9771963953971863, + "learning_rate": 9.919207709022407e-06, + "loss": 0.0289, + "step": 288 + }, + { + "epoch": 0.25626246951895365, + "grad_norm": 0.8762062191963196, + "learning_rate": 9.91834828385957e-06, + "loss": 0.0139, + "step": 289 + }, + { + "epoch": 0.2571491908667701, + "grad_norm": 0.9518645405769348, + "learning_rate": 9.917484349423719e-06, + "loss": 0.0079, + "step": 290 + }, + { + "epoch": 0.25803591221458655, + "grad_norm": 0.36570677161216736, + "learning_rate": 9.916615906506927e-06, + "loss": 0.0119, + "step": 291 + }, + { + "epoch": 0.258922633562403, + "grad_norm": 0.5024058818817139, + "learning_rate": 9.91574295590541e-06, + "loss": 0.0105, + "step": 292 + }, + { + "epoch": 0.25980935491021945, + "grad_norm": 1.2302806377410889, + "learning_rate": 9.91486549841951e-06, + "loss": 0.0122, + "step": 293 + }, + { + "epoch": 0.2606960762580359, + "grad_norm": 0.4594174921512604, + "learning_rate": 9.913983534853703e-06, + "loss": 0.0101, + "step": 294 + }, + { + "epoch": 0.26158279760585235, + "grad_norm": 1.1169053316116333, + "learning_rate": 9.913097066016601e-06, + "loss": 0.0161, + "step": 295 + }, + { + "epoch": 0.2624695189536688, + "grad_norm": 0.719704806804657, + "learning_rate": 9.912206092720939e-06, + "loss": 0.008, + "step": 296 + }, + { + "epoch": 0.26335624030148524, + "grad_norm": 0.5716864466667175, + "learning_rate": 9.911310615783583e-06, + "loss": 0.022, + "step": 297 + }, + { + "epoch": 0.2642429616493017, + "grad_norm": 0.6044641137123108, + "learning_rate": 9.910410636025536e-06, + "loss": 0.0152, + "step": 298 + }, + { + "epoch": 0.26512968299711814, + "grad_norm": 0.4611899256706238, + "learning_rate": 9.909506154271917e-06, + "loss": 0.0105, + "step": 299 + }, + { + "epoch": 0.2660164043449346, + "grad_norm": 1.0533437728881836, + "learning_rate": 9.908597171351984e-06, + "loss": 0.0098, + "step": 300 + }, + { + "epoch": 0.26690312569275104, + "grad_norm": 0.5442478060722351, + "learning_rate": 9.907683688099114e-06, + "loss": 0.0069, + "step": 301 + }, + { + "epoch": 0.2677898470405675, + "grad_norm": 0.6688672304153442, + "learning_rate": 9.906765705350814e-06, + "loss": 0.0143, + "step": 302 + }, + { + "epoch": 0.26867656838838394, + "grad_norm": 0.7143788933753967, + "learning_rate": 9.905843223948715e-06, + "loss": 0.0138, + "step": 303 + }, + { + "epoch": 0.2695632897362004, + "grad_norm": 1.6420787572860718, + "learning_rate": 9.904916244738572e-06, + "loss": 0.0206, + "step": 304 + }, + { + "epoch": 0.27045001108401684, + "grad_norm": 0.9799435138702393, + "learning_rate": 9.903984768570264e-06, + "loss": 0.0118, + "step": 305 + }, + { + "epoch": 0.2713367324318333, + "grad_norm": 1.430992841720581, + "learning_rate": 9.903048796297794e-06, + "loss": 0.0176, + "step": 306 + }, + { + "epoch": 0.27222345377964974, + "grad_norm": 6.0022125244140625, + "learning_rate": 9.902108328779287e-06, + "loss": 0.0162, + "step": 307 + }, + { + "epoch": 0.2731101751274662, + "grad_norm": 0.5858060717582703, + "learning_rate": 9.90116336687699e-06, + "loss": 0.0112, + "step": 308 + }, + { + "epoch": 0.27399689647528264, + "grad_norm": 0.7073821425437927, + "learning_rate": 9.900213911457263e-06, + "loss": 0.0156, + "step": 309 + }, + { + "epoch": 0.2748836178230991, + "grad_norm": 0.3150938153266907, + "learning_rate": 9.8992599633906e-06, + "loss": 0.0085, + "step": 310 + }, + { + "epoch": 0.27577033917091553, + "grad_norm": 0.5376273989677429, + "learning_rate": 9.898301523551601e-06, + "loss": 0.0115, + "step": 311 + }, + { + "epoch": 0.276657060518732, + "grad_norm": 0.32037392258644104, + "learning_rate": 9.89733859281899e-06, + "loss": 0.0046, + "step": 312 + }, + { + "epoch": 0.27754378186654843, + "grad_norm": 0.9390332698822021, + "learning_rate": 9.89637117207561e-06, + "loss": 0.0109, + "step": 313 + }, + { + "epoch": 0.2784305032143649, + "grad_norm": 0.5948623418807983, + "learning_rate": 9.895399262208414e-06, + "loss": 0.0051, + "step": 314 + }, + { + "epoch": 0.27931722456218133, + "grad_norm": 0.5544528365135193, + "learning_rate": 9.894422864108479e-06, + "loss": 0.0132, + "step": 315 + }, + { + "epoch": 0.2802039459099978, + "grad_norm": 0.9701108336448669, + "learning_rate": 9.893441978670993e-06, + "loss": 0.0088, + "step": 316 + }, + { + "epoch": 0.28109066725781423, + "grad_norm": 0.6148018836975098, + "learning_rate": 9.892456606795254e-06, + "loss": 0.0048, + "step": 317 + }, + { + "epoch": 0.2819773886056307, + "grad_norm": 0.563202440738678, + "learning_rate": 9.891466749384679e-06, + "loss": 0.0115, + "step": 318 + }, + { + "epoch": 0.28286410995344713, + "grad_norm": 0.6775363683700562, + "learning_rate": 9.890472407346796e-06, + "loss": 0.007, + "step": 319 + }, + { + "epoch": 0.28375083130126355, + "grad_norm": 0.820683479309082, + "learning_rate": 9.889473581593245e-06, + "loss": 0.0162, + "step": 320 + }, + { + "epoch": 0.28463755264908003, + "grad_norm": 1.3535810708999634, + "learning_rate": 9.888470273039776e-06, + "loss": 0.016, + "step": 321 + }, + { + "epoch": 0.28552427399689645, + "grad_norm": 0.42245036363601685, + "learning_rate": 9.88746248260625e-06, + "loss": 0.0038, + "step": 322 + }, + { + "epoch": 0.2864109953447129, + "grad_norm": 0.38233160972595215, + "learning_rate": 9.886450211216631e-06, + "loss": 0.002, + "step": 323 + }, + { + "epoch": 0.28729771669252935, + "grad_norm": 1.296417236328125, + "learning_rate": 9.885433459799003e-06, + "loss": 0.0111, + "step": 324 + }, + { + "epoch": 0.2881844380403458, + "grad_norm": 1.1593570709228516, + "learning_rate": 9.884412229285547e-06, + "loss": 0.0101, + "step": 325 + }, + { + "epoch": 0.28907115938816225, + "grad_norm": 0.44105803966522217, + "learning_rate": 9.883386520612556e-06, + "loss": 0.0113, + "step": 326 + }, + { + "epoch": 0.2899578807359787, + "grad_norm": 1.3377820253372192, + "learning_rate": 9.882356334720426e-06, + "loss": 0.0083, + "step": 327 + }, + { + "epoch": 0.29084460208379515, + "grad_norm": 1.594089150428772, + "learning_rate": 9.881321672553661e-06, + "loss": 0.0242, + "step": 328 + }, + { + "epoch": 0.2917313234316116, + "grad_norm": 0.6307036876678467, + "learning_rate": 9.880282535060863e-06, + "loss": 0.015, + "step": 329 + }, + { + "epoch": 0.29261804477942804, + "grad_norm": 1.0397772789001465, + "learning_rate": 9.879238923194746e-06, + "loss": 0.0209, + "step": 330 + }, + { + "epoch": 0.2935047661272445, + "grad_norm": 1.5129303932189941, + "learning_rate": 9.878190837912118e-06, + "loss": 0.0241, + "step": 331 + }, + { + "epoch": 0.29439148747506094, + "grad_norm": 1.1227725744247437, + "learning_rate": 9.87713828017389e-06, + "loss": 0.0224, + "step": 332 + }, + { + "epoch": 0.2952782088228774, + "grad_norm": 1.045912504196167, + "learning_rate": 9.87608125094508e-06, + "loss": 0.0106, + "step": 333 + }, + { + "epoch": 0.29616493017069384, + "grad_norm": 0.5505847930908203, + "learning_rate": 9.875019751194796e-06, + "loss": 0.0078, + "step": 334 + }, + { + "epoch": 0.2970516515185103, + "grad_norm": 1.4759756326675415, + "learning_rate": 9.873953781896251e-06, + "loss": 0.0229, + "step": 335 + }, + { + "epoch": 0.29793837286632674, + "grad_norm": 1.1660606861114502, + "learning_rate": 9.872883344026755e-06, + "loss": 0.0147, + "step": 336 + }, + { + "epoch": 0.2988250942141432, + "grad_norm": 0.9158754944801331, + "learning_rate": 9.871808438567713e-06, + "loss": 0.01, + "step": 337 + }, + { + "epoch": 0.29971181556195964, + "grad_norm": 0.3273065388202667, + "learning_rate": 9.870729066504629e-06, + "loss": 0.005, + "step": 338 + }, + { + "epoch": 0.3005985369097761, + "grad_norm": 0.857749342918396, + "learning_rate": 9.869645228827097e-06, + "loss": 0.0219, + "step": 339 + }, + { + "epoch": 0.30148525825759254, + "grad_norm": 1.448362112045288, + "learning_rate": 9.868556926528809e-06, + "loss": 0.0227, + "step": 340 + }, + { + "epoch": 0.302371979605409, + "grad_norm": 1.0792773962020874, + "learning_rate": 9.867464160607552e-06, + "loss": 0.0189, + "step": 341 + }, + { + "epoch": 0.30325870095322544, + "grad_norm": 0.3688802421092987, + "learning_rate": 9.866366932065199e-06, + "loss": 0.0032, + "step": 342 + }, + { + "epoch": 0.3041454223010419, + "grad_norm": 0.6941332221031189, + "learning_rate": 9.865265241907722e-06, + "loss": 0.0066, + "step": 343 + }, + { + "epoch": 0.30503214364885833, + "grad_norm": 0.35826510190963745, + "learning_rate": 9.86415909114518e-06, + "loss": 0.0093, + "step": 344 + }, + { + "epoch": 0.3059188649966748, + "grad_norm": 0.45968568325042725, + "learning_rate": 9.863048480791718e-06, + "loss": 0.005, + "step": 345 + }, + { + "epoch": 0.30680558634449123, + "grad_norm": 0.7145997881889343, + "learning_rate": 9.861933411865576e-06, + "loss": 0.0139, + "step": 346 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 1.11654794216156, + "learning_rate": 9.86081388538908e-06, + "loss": 0.0064, + "step": 347 + }, + { + "epoch": 0.30857902904012413, + "grad_norm": 0.5863871574401855, + "learning_rate": 9.859689902388639e-06, + "loss": 0.0119, + "step": 348 + }, + { + "epoch": 0.3094657503879406, + "grad_norm": 0.5604485869407654, + "learning_rate": 9.858561463894752e-06, + "loss": 0.0046, + "step": 349 + }, + { + "epoch": 0.31035247173575703, + "grad_norm": 0.575162410736084, + "learning_rate": 9.857428570942e-06, + "loss": 0.0082, + "step": 350 + }, + { + "epoch": 0.3112391930835735, + "grad_norm": 0.6239818930625916, + "learning_rate": 9.856291224569052e-06, + "loss": 0.0104, + "step": 351 + }, + { + "epoch": 0.31212591443138993, + "grad_norm": 0.768072247505188, + "learning_rate": 9.855149425818657e-06, + "loss": 0.0113, + "step": 352 + }, + { + "epoch": 0.3130126357792064, + "grad_norm": 0.9411647915840149, + "learning_rate": 9.854003175737645e-06, + "loss": 0.0178, + "step": 353 + }, + { + "epoch": 0.3138993571270228, + "grad_norm": 0.3603442907333374, + "learning_rate": 9.85285247537693e-06, + "loss": 0.0067, + "step": 354 + }, + { + "epoch": 0.3147860784748393, + "grad_norm": 2.249364137649536, + "learning_rate": 9.851697325791505e-06, + "loss": 0.0368, + "step": 355 + }, + { + "epoch": 0.3156727998226557, + "grad_norm": 10.260486602783203, + "learning_rate": 9.850537728040442e-06, + "loss": 0.0724, + "step": 356 + }, + { + "epoch": 0.3165595211704722, + "grad_norm": 0.7737874984741211, + "learning_rate": 9.849373683186887e-06, + "loss": 0.0153, + "step": 357 + }, + { + "epoch": 0.3174462425182886, + "grad_norm": 0.9794998168945312, + "learning_rate": 9.848205192298075e-06, + "loss": 0.01, + "step": 358 + }, + { + "epoch": 0.3183329638661051, + "grad_norm": 0.3936000168323517, + "learning_rate": 9.847032256445303e-06, + "loss": 0.0078, + "step": 359 + }, + { + "epoch": 0.3192196852139215, + "grad_norm": 1.2881196737289429, + "learning_rate": 9.845854876703955e-06, + "loss": 0.0279, + "step": 360 + }, + { + "epoch": 0.320106406561738, + "grad_norm": 0.8685862421989441, + "learning_rate": 9.84467305415348e-06, + "loss": 0.0194, + "step": 361 + }, + { + "epoch": 0.3209931279095544, + "grad_norm": 0.48143163323402405, + "learning_rate": 9.843486789877405e-06, + "loss": 0.0098, + "step": 362 + }, + { + "epoch": 0.3218798492573709, + "grad_norm": 0.6279829144477844, + "learning_rate": 9.842296084963332e-06, + "loss": 0.0139, + "step": 363 + }, + { + "epoch": 0.3227665706051873, + "grad_norm": 0.955162763595581, + "learning_rate": 9.84110094050293e-06, + "loss": 0.0157, + "step": 364 + }, + { + "epoch": 0.32365329195300374, + "grad_norm": 0.7307074666023254, + "learning_rate": 9.839901357591935e-06, + "loss": 0.0095, + "step": 365 + }, + { + "epoch": 0.3245400133008202, + "grad_norm": 2.4147238731384277, + "learning_rate": 9.838697337330163e-06, + "loss": 0.0207, + "step": 366 + }, + { + "epoch": 0.32542673464863664, + "grad_norm": 0.3981778025627136, + "learning_rate": 9.837488880821487e-06, + "loss": 0.0051, + "step": 367 + }, + { + "epoch": 0.3263134559964531, + "grad_norm": 0.6395242810249329, + "learning_rate": 9.836275989173855e-06, + "loss": 0.0132, + "step": 368 + }, + { + "epoch": 0.32720017734426954, + "grad_norm": 0.3053138256072998, + "learning_rate": 9.835058663499277e-06, + "loss": 0.0073, + "step": 369 + }, + { + "epoch": 0.328086898692086, + "grad_norm": 1.085937261581421, + "learning_rate": 9.833836904913831e-06, + "loss": 0.011, + "step": 370 + }, + { + "epoch": 0.32897362003990244, + "grad_norm": 0.8119531273841858, + "learning_rate": 9.832610714537655e-06, + "loss": 0.01, + "step": 371 + }, + { + "epoch": 0.3298603413877189, + "grad_norm": 0.6228383183479309, + "learning_rate": 9.831380093494957e-06, + "loss": 0.0047, + "step": 372 + }, + { + "epoch": 0.33074706273553534, + "grad_norm": 0.8070245385169983, + "learning_rate": 9.830145042914003e-06, + "loss": 0.0167, + "step": 373 + }, + { + "epoch": 0.3316337840833518, + "grad_norm": 0.6485422849655151, + "learning_rate": 9.828905563927117e-06, + "loss": 0.015, + "step": 374 + }, + { + "epoch": 0.33252050543116823, + "grad_norm": 1.153606653213501, + "learning_rate": 9.82766165767069e-06, + "loss": 0.0068, + "step": 375 + }, + { + "epoch": 0.3334072267789847, + "grad_norm": 1.0364385843276978, + "learning_rate": 9.826413325285162e-06, + "loss": 0.0159, + "step": 376 + }, + { + "epoch": 0.33429394812680113, + "grad_norm": 0.6059282422065735, + "learning_rate": 9.825160567915045e-06, + "loss": 0.0057, + "step": 377 + }, + { + "epoch": 0.3351806694746176, + "grad_norm": 1.1247456073760986, + "learning_rate": 9.823903386708897e-06, + "loss": 0.0119, + "step": 378 + }, + { + "epoch": 0.33606739082243403, + "grad_norm": 1.2303680181503296, + "learning_rate": 9.822641782819337e-06, + "loss": 0.0223, + "step": 379 + }, + { + "epoch": 0.3369541121702505, + "grad_norm": 0.7362028956413269, + "learning_rate": 9.821375757403035e-06, + "loss": 0.0143, + "step": 380 + }, + { + "epoch": 0.33784083351806693, + "grad_norm": 0.8117135763168335, + "learning_rate": 9.820105311620717e-06, + "loss": 0.0153, + "step": 381 + }, + { + "epoch": 0.3387275548658834, + "grad_norm": 0.6031385064125061, + "learning_rate": 9.818830446637165e-06, + "loss": 0.0177, + "step": 382 + }, + { + "epoch": 0.33961427621369983, + "grad_norm": 0.5231031179428101, + "learning_rate": 9.817551163621205e-06, + "loss": 0.0117, + "step": 383 + }, + { + "epoch": 0.3405009975615163, + "grad_norm": 0.6594051122665405, + "learning_rate": 9.81626746374572e-06, + "loss": 0.0141, + "step": 384 + }, + { + "epoch": 0.3413877189093327, + "grad_norm": 0.4554224908351898, + "learning_rate": 9.814979348187639e-06, + "loss": 0.0128, + "step": 385 + }, + { + "epoch": 0.3422744402571492, + "grad_norm": 0.5283925533294678, + "learning_rate": 9.813686818127943e-06, + "loss": 0.0115, + "step": 386 + }, + { + "epoch": 0.3431611616049656, + "grad_norm": 0.5425133109092712, + "learning_rate": 9.812389874751656e-06, + "loss": 0.0115, + "step": 387 + }, + { + "epoch": 0.3440478829527821, + "grad_norm": 0.5007458925247192, + "learning_rate": 9.81108851924785e-06, + "loss": 0.0115, + "step": 388 + }, + { + "epoch": 0.3449346043005985, + "grad_norm": 0.5451713800430298, + "learning_rate": 9.809782752809644e-06, + "loss": 0.0093, + "step": 389 + }, + { + "epoch": 0.345821325648415, + "grad_norm": 0.6177766919136047, + "learning_rate": 9.808472576634199e-06, + "loss": 0.0087, + "step": 390 + }, + { + "epoch": 0.3467080469962314, + "grad_norm": 0.5689627528190613, + "learning_rate": 9.80715799192272e-06, + "loss": 0.0125, + "step": 391 + }, + { + "epoch": 0.3475947683440479, + "grad_norm": 0.36475062370300293, + "learning_rate": 9.805838999880453e-06, + "loss": 0.0071, + "step": 392 + }, + { + "epoch": 0.3484814896918643, + "grad_norm": 0.7817016839981079, + "learning_rate": 9.804515601716685e-06, + "loss": 0.0151, + "step": 393 + }, + { + "epoch": 0.3493682110396808, + "grad_norm": 0.42941737174987793, + "learning_rate": 9.80318779864474e-06, + "loss": 0.0038, + "step": 394 + }, + { + "epoch": 0.3502549323874972, + "grad_norm": 0.5236615538597107, + "learning_rate": 9.801855591881989e-06, + "loss": 0.0089, + "step": 395 + }, + { + "epoch": 0.3511416537353137, + "grad_norm": 0.4579971134662628, + "learning_rate": 9.80051898264983e-06, + "loss": 0.0094, + "step": 396 + }, + { + "epoch": 0.3520283750831301, + "grad_norm": 0.9356654286384583, + "learning_rate": 9.799177972173706e-06, + "loss": 0.0188, + "step": 397 + }, + { + "epoch": 0.3529150964309466, + "grad_norm": 0.664996325969696, + "learning_rate": 9.797832561683087e-06, + "loss": 0.016, + "step": 398 + }, + { + "epoch": 0.353801817778763, + "grad_norm": 0.8114444613456726, + "learning_rate": 9.796482752411482e-06, + "loss": 0.0101, + "step": 399 + }, + { + "epoch": 0.3546885391265795, + "grad_norm": 0.5470486283302307, + "learning_rate": 9.795128545596436e-06, + "loss": 0.0082, + "step": 400 + }, + { + "epoch": 0.3555752604743959, + "grad_norm": 0.716001570224762, + "learning_rate": 9.793769942479518e-06, + "loss": 0.0154, + "step": 401 + }, + { + "epoch": 0.3564619818222124, + "grad_norm": 0.8142730593681335, + "learning_rate": 9.792406944306334e-06, + "loss": 0.0098, + "step": 402 + }, + { + "epoch": 0.3573487031700288, + "grad_norm": 0.8549275398254395, + "learning_rate": 9.791039552326515e-06, + "loss": 0.0065, + "step": 403 + }, + { + "epoch": 0.3582354245178453, + "grad_norm": 0.5257267951965332, + "learning_rate": 9.789667767793725e-06, + "loss": 0.015, + "step": 404 + }, + { + "epoch": 0.3591221458656617, + "grad_norm": 0.48406821489334106, + "learning_rate": 9.788291591965651e-06, + "loss": 0.0041, + "step": 405 + }, + { + "epoch": 0.3600088672134782, + "grad_norm": 1.1776056289672852, + "learning_rate": 9.786911026104007e-06, + "loss": 0.0137, + "step": 406 + }, + { + "epoch": 0.3608955885612946, + "grad_norm": 0.8189646005630493, + "learning_rate": 9.785526071474533e-06, + "loss": 0.0187, + "step": 407 + }, + { + "epoch": 0.3617823099091111, + "grad_norm": 1.4497982263565063, + "learning_rate": 9.784136729346994e-06, + "loss": 0.0203, + "step": 408 + }, + { + "epoch": 0.3626690312569275, + "grad_norm": 0.32031527161598206, + "learning_rate": 9.782743000995175e-06, + "loss": 0.0052, + "step": 409 + }, + { + "epoch": 0.363555752604744, + "grad_norm": 0.3057740330696106, + "learning_rate": 9.781344887696884e-06, + "loss": 0.0074, + "step": 410 + }, + { + "epoch": 0.3644424739525604, + "grad_norm": 0.6097119450569153, + "learning_rate": 9.779942390733948e-06, + "loss": 0.0124, + "step": 411 + }, + { + "epoch": 0.36532919530037683, + "grad_norm": 0.25449925661087036, + "learning_rate": 9.77853551139221e-06, + "loss": 0.0053, + "step": 412 + }, + { + "epoch": 0.3662159166481933, + "grad_norm": 0.43352335691452026, + "learning_rate": 9.777124250961543e-06, + "loss": 0.0094, + "step": 413 + }, + { + "epoch": 0.36710263799600973, + "grad_norm": 0.5348715782165527, + "learning_rate": 9.775708610735821e-06, + "loss": 0.0081, + "step": 414 + }, + { + "epoch": 0.3679893593438262, + "grad_norm": 0.3730040192604065, + "learning_rate": 9.774288592012944e-06, + "loss": 0.0052, + "step": 415 + }, + { + "epoch": 0.3688760806916426, + "grad_norm": 0.6741971373558044, + "learning_rate": 9.77286419609482e-06, + "loss": 0.0112, + "step": 416 + }, + { + "epoch": 0.3697628020394591, + "grad_norm": 0.6674507260322571, + "learning_rate": 9.771435424287377e-06, + "loss": 0.0128, + "step": 417 + }, + { + "epoch": 0.3706495233872755, + "grad_norm": 0.7770352363586426, + "learning_rate": 9.77000227790055e-06, + "loss": 0.0139, + "step": 418 + }, + { + "epoch": 0.371536244735092, + "grad_norm": 0.6249715089797974, + "learning_rate": 9.768564758248285e-06, + "loss": 0.0141, + "step": 419 + }, + { + "epoch": 0.3724229660829084, + "grad_norm": 0.42244604229927063, + "learning_rate": 9.76712286664854e-06, + "loss": 0.0047, + "step": 420 + }, + { + "epoch": 0.3733096874307249, + "grad_norm": 0.6065423488616943, + "learning_rate": 9.765676604423277e-06, + "loss": 0.0128, + "step": 421 + }, + { + "epoch": 0.3741964087785413, + "grad_norm": 0.9267554879188538, + "learning_rate": 9.76422597289847e-06, + "loss": 0.0172, + "step": 422 + }, + { + "epoch": 0.3750831301263578, + "grad_norm": 0.6917939782142639, + "learning_rate": 9.762770973404094e-06, + "loss": 0.0138, + "step": 423 + }, + { + "epoch": 0.3759698514741742, + "grad_norm": 0.4068993330001831, + "learning_rate": 9.761311607274135e-06, + "loss": 0.0036, + "step": 424 + }, + { + "epoch": 0.3768565728219907, + "grad_norm": 1.279029369354248, + "learning_rate": 9.759847875846578e-06, + "loss": 0.0099, + "step": 425 + }, + { + "epoch": 0.3777432941698071, + "grad_norm": 0.6679139137268066, + "learning_rate": 9.758379780463409e-06, + "loss": 0.0067, + "step": 426 + }, + { + "epoch": 0.3786300155176236, + "grad_norm": 1.1384234428405762, + "learning_rate": 9.756907322470619e-06, + "loss": 0.0142, + "step": 427 + }, + { + "epoch": 0.37951673686544, + "grad_norm": 0.9144220948219299, + "learning_rate": 9.755430503218197e-06, + "loss": 0.0088, + "step": 428 + }, + { + "epoch": 0.3804034582132565, + "grad_norm": 0.9335492849349976, + "learning_rate": 9.753949324060127e-06, + "loss": 0.0129, + "step": 429 + }, + { + "epoch": 0.3812901795610729, + "grad_norm": 0.5396351218223572, + "learning_rate": 9.7524637863544e-06, + "loss": 0.0096, + "step": 430 + }, + { + "epoch": 0.3821769009088894, + "grad_norm": 0.5314906239509583, + "learning_rate": 9.75097389146299e-06, + "loss": 0.0097, + "step": 431 + }, + { + "epoch": 0.3830636222567058, + "grad_norm": 0.6313880681991577, + "learning_rate": 9.749479640751876e-06, + "loss": 0.0095, + "step": 432 + }, + { + "epoch": 0.3839503436045223, + "grad_norm": 0.955323338508606, + "learning_rate": 9.747981035591025e-06, + "loss": 0.0121, + "step": 433 + }, + { + "epoch": 0.3848370649523387, + "grad_norm": 1.0209813117980957, + "learning_rate": 9.7464780773544e-06, + "loss": 0.0126, + "step": 434 + }, + { + "epoch": 0.3857237863001552, + "grad_norm": 0.4658578038215637, + "learning_rate": 9.744970767419952e-06, + "loss": 0.0082, + "step": 435 + }, + { + "epoch": 0.3866105076479716, + "grad_norm": 0.6823182702064514, + "learning_rate": 9.743459107169624e-06, + "loss": 0.013, + "step": 436 + }, + { + "epoch": 0.3874972289957881, + "grad_norm": 0.2446506917476654, + "learning_rate": 9.741943097989345e-06, + "loss": 0.0038, + "step": 437 + }, + { + "epoch": 0.3883839503436045, + "grad_norm": 0.42889147996902466, + "learning_rate": 9.740422741269035e-06, + "loss": 0.0099, + "step": 438 + }, + { + "epoch": 0.389270671691421, + "grad_norm": 0.30183982849121094, + "learning_rate": 9.738898038402597e-06, + "loss": 0.0064, + "step": 439 + }, + { + "epoch": 0.3901573930392374, + "grad_norm": 1.023352861404419, + "learning_rate": 9.737368990787917e-06, + "loss": 0.0071, + "step": 440 + }, + { + "epoch": 0.3910441143870539, + "grad_norm": 0.39613592624664307, + "learning_rate": 9.735835599826868e-06, + "loss": 0.0062, + "step": 441 + }, + { + "epoch": 0.3919308357348703, + "grad_norm": 1.0003458261489868, + "learning_rate": 9.734297866925305e-06, + "loss": 0.0076, + "step": 442 + }, + { + "epoch": 0.3928175570826868, + "grad_norm": 0.4582412838935852, + "learning_rate": 9.732755793493062e-06, + "loss": 0.008, + "step": 443 + }, + { + "epoch": 0.3937042784305032, + "grad_norm": 0.969808042049408, + "learning_rate": 9.731209380943951e-06, + "loss": 0.0152, + "step": 444 + }, + { + "epoch": 0.3945909997783197, + "grad_norm": 0.38053014874458313, + "learning_rate": 9.729658630695768e-06, + "loss": 0.0033, + "step": 445 + }, + { + "epoch": 0.3954777211261361, + "grad_norm": 0.8101393580436707, + "learning_rate": 9.72810354417028e-06, + "loss": 0.0111, + "step": 446 + }, + { + "epoch": 0.3963644424739526, + "grad_norm": 0.7292132377624512, + "learning_rate": 9.72654412279323e-06, + "loss": 0.0086, + "step": 447 + }, + { + "epoch": 0.397251163821769, + "grad_norm": 0.44195911288261414, + "learning_rate": 9.724980367994341e-06, + "loss": 0.0058, + "step": 448 + }, + { + "epoch": 0.3981378851695855, + "grad_norm": 0.2693403959274292, + "learning_rate": 9.723412281207304e-06, + "loss": 0.0019, + "step": 449 + }, + { + "epoch": 0.3990246065174019, + "grad_norm": 0.2798308730125427, + "learning_rate": 9.721839863869783e-06, + "loss": 0.0051, + "step": 450 + }, + { + "epoch": 0.3999113278652184, + "grad_norm": 0.8944824934005737, + "learning_rate": 9.720263117423408e-06, + "loss": 0.0111, + "step": 451 + }, + { + "epoch": 0.4007980492130348, + "grad_norm": 0.4507511258125305, + "learning_rate": 9.718682043313787e-06, + "loss": 0.0084, + "step": 452 + }, + { + "epoch": 0.4016847705608513, + "grad_norm": 0.4930645823478699, + "learning_rate": 9.71709664299049e-06, + "loss": 0.0149, + "step": 453 + }, + { + "epoch": 0.4025714919086677, + "grad_norm": 1.0915964841842651, + "learning_rate": 9.715506917907055e-06, + "loss": 0.0116, + "step": 454 + }, + { + "epoch": 0.4034582132564842, + "grad_norm": 0.45030245184898376, + "learning_rate": 9.713912869520982e-06, + "loss": 0.0067, + "step": 455 + }, + { + "epoch": 0.4043449346043006, + "grad_norm": 0.6651377081871033, + "learning_rate": 9.71231449929374e-06, + "loss": 0.0071, + "step": 456 + }, + { + "epoch": 0.405231655952117, + "grad_norm": 0.5960193276405334, + "learning_rate": 9.710711808690754e-06, + "loss": 0.0066, + "step": 457 + }, + { + "epoch": 0.4061183772999335, + "grad_norm": 0.290302038192749, + "learning_rate": 9.709104799181418e-06, + "loss": 0.0046, + "step": 458 + }, + { + "epoch": 0.4070050986477499, + "grad_norm": 0.7967752814292908, + "learning_rate": 9.707493472239075e-06, + "loss": 0.0062, + "step": 459 + }, + { + "epoch": 0.4078918199955664, + "grad_norm": 0.759286642074585, + "learning_rate": 9.70587782934104e-06, + "loss": 0.0184, + "step": 460 + }, + { + "epoch": 0.4087785413433828, + "grad_norm": 0.5127438306808472, + "learning_rate": 9.704257871968573e-06, + "loss": 0.0117, + "step": 461 + }, + { + "epoch": 0.4096652626911993, + "grad_norm": 0.4300234913825989, + "learning_rate": 9.702633601606893e-06, + "loss": 0.0052, + "step": 462 + }, + { + "epoch": 0.4105519840390157, + "grad_norm": 0.3789842128753662, + "learning_rate": 9.701005019745178e-06, + "loss": 0.0087, + "step": 463 + }, + { + "epoch": 0.4114387053868322, + "grad_norm": 0.6187935471534729, + "learning_rate": 9.699372127876555e-06, + "loss": 0.0104, + "step": 464 + }, + { + "epoch": 0.4123254267346486, + "grad_norm": 0.8972629904747009, + "learning_rate": 9.697734927498099e-06, + "loss": 0.0253, + "step": 465 + }, + { + "epoch": 0.4132121480824651, + "grad_norm": 0.24438129365444183, + "learning_rate": 9.696093420110843e-06, + "loss": 0.0052, + "step": 466 + }, + { + "epoch": 0.4140988694302815, + "grad_norm": 1.079038381576538, + "learning_rate": 9.694447607219763e-06, + "loss": 0.01, + "step": 467 + }, + { + "epoch": 0.414985590778098, + "grad_norm": 0.382589727640152, + "learning_rate": 9.692797490333785e-06, + "loss": 0.0049, + "step": 468 + }, + { + "epoch": 0.4158723121259144, + "grad_norm": 0.7200171947479248, + "learning_rate": 9.69114307096578e-06, + "loss": 0.0095, + "step": 469 + }, + { + "epoch": 0.4167590334737309, + "grad_norm": 0.6705244779586792, + "learning_rate": 9.689484350632564e-06, + "loss": 0.0085, + "step": 470 + }, + { + "epoch": 0.4176457548215473, + "grad_norm": 0.5431262254714966, + "learning_rate": 9.687821330854894e-06, + "loss": 0.0071, + "step": 471 + }, + { + "epoch": 0.4185324761693638, + "grad_norm": 0.5441691875457764, + "learning_rate": 9.686154013157474e-06, + "loss": 0.009, + "step": 472 + }, + { + "epoch": 0.4194191975171802, + "grad_norm": 1.1173381805419922, + "learning_rate": 9.684482399068944e-06, + "loss": 0.0156, + "step": 473 + }, + { + "epoch": 0.4203059188649967, + "grad_norm": 1.0084078311920166, + "learning_rate": 9.682806490121886e-06, + "loss": 0.0102, + "step": 474 + }, + { + "epoch": 0.4211926402128131, + "grad_norm": 0.3242226243019104, + "learning_rate": 9.681126287852814e-06, + "loss": 0.0048, + "step": 475 + }, + { + "epoch": 0.4220793615606296, + "grad_norm": 1.2636985778808594, + "learning_rate": 9.679441793802186e-06, + "loss": 0.0172, + "step": 476 + }, + { + "epoch": 0.422966082908446, + "grad_norm": 0.8478817343711853, + "learning_rate": 9.67775300951439e-06, + "loss": 0.0134, + "step": 477 + }, + { + "epoch": 0.4238528042562625, + "grad_norm": 0.8426191806793213, + "learning_rate": 9.67605993653775e-06, + "loss": 0.0132, + "step": 478 + }, + { + "epoch": 0.4247395256040789, + "grad_norm": 0.661734938621521, + "learning_rate": 9.674362576424516e-06, + "loss": 0.0096, + "step": 479 + }, + { + "epoch": 0.4256262469518954, + "grad_norm": 0.6070396900177002, + "learning_rate": 9.67266093073088e-06, + "loss": 0.0167, + "step": 480 + }, + { + "epoch": 0.4265129682997118, + "grad_norm": 0.9344643950462341, + "learning_rate": 9.670955001016949e-06, + "loss": 0.0115, + "step": 481 + }, + { + "epoch": 0.4273996896475283, + "grad_norm": 0.843940019607544, + "learning_rate": 9.669244788846769e-06, + "loss": 0.0079, + "step": 482 + }, + { + "epoch": 0.4282864109953447, + "grad_norm": 0.26441389322280884, + "learning_rate": 9.667530295788307e-06, + "loss": 0.0025, + "step": 483 + }, + { + "epoch": 0.4291731323431612, + "grad_norm": 0.34116652607917786, + "learning_rate": 9.665811523413457e-06, + "loss": 0.0047, + "step": 484 + }, + { + "epoch": 0.4300598536909776, + "grad_norm": 0.37784484028816223, + "learning_rate": 9.664088473298035e-06, + "loss": 0.0086, + "step": 485 + }, + { + "epoch": 0.4309465750387941, + "grad_norm": 0.44266486167907715, + "learning_rate": 9.66236114702178e-06, + "loss": 0.014, + "step": 486 + }, + { + "epoch": 0.4318332963866105, + "grad_norm": 0.689039409160614, + "learning_rate": 9.66062954616835e-06, + "loss": 0.005, + "step": 487 + }, + { + "epoch": 0.432720017734427, + "grad_norm": 1.1525181531906128, + "learning_rate": 9.65889367232532e-06, + "loss": 0.0115, + "step": 488 + }, + { + "epoch": 0.4336067390822434, + "grad_norm": 0.36996787786483765, + "learning_rate": 9.657153527084191e-06, + "loss": 0.0064, + "step": 489 + }, + { + "epoch": 0.4344934604300599, + "grad_norm": 0.2661356031894684, + "learning_rate": 9.655409112040372e-06, + "loss": 0.004, + "step": 490 + }, + { + "epoch": 0.4353801817778763, + "grad_norm": 0.7092905640602112, + "learning_rate": 9.653660428793188e-06, + "loss": 0.0134, + "step": 491 + }, + { + "epoch": 0.4362669031256928, + "grad_norm": 0.9240937829017639, + "learning_rate": 9.651907478945882e-06, + "loss": 0.0128, + "step": 492 + }, + { + "epoch": 0.4371536244735092, + "grad_norm": 0.1982317417860031, + "learning_rate": 9.650150264105599e-06, + "loss": 0.0021, + "step": 493 + }, + { + "epoch": 0.43804034582132567, + "grad_norm": 0.790959358215332, + "learning_rate": 9.648388785883407e-06, + "loss": 0.011, + "step": 494 + }, + { + "epoch": 0.4389270671691421, + "grad_norm": 0.6713638305664062, + "learning_rate": 9.646623045894274e-06, + "loss": 0.0118, + "step": 495 + }, + { + "epoch": 0.43981378851695857, + "grad_norm": 0.5016664862632751, + "learning_rate": 9.644853045757077e-06, + "loss": 0.0055, + "step": 496 + }, + { + "epoch": 0.440700509864775, + "grad_norm": 0.5184798836708069, + "learning_rate": 9.6430787870946e-06, + "loss": 0.0159, + "step": 497 + }, + { + "epoch": 0.44158723121259147, + "grad_norm": 0.4628726541996002, + "learning_rate": 9.641300271533528e-06, + "loss": 0.0029, + "step": 498 + }, + { + "epoch": 0.4424739525604079, + "grad_norm": 0.5552617311477661, + "learning_rate": 9.639517500704456e-06, + "loss": 0.0075, + "step": 499 + }, + { + "epoch": 0.44336067390822437, + "grad_norm": 0.5783096551895142, + "learning_rate": 9.637730476241874e-06, + "loss": 0.0074, + "step": 500 + }, + { + "epoch": 0.4442473952560408, + "grad_norm": 0.46593576669692993, + "learning_rate": 9.635939199784173e-06, + "loss": 0.0039, + "step": 501 + }, + { + "epoch": 0.4451341166038572, + "grad_norm": 0.29269683361053467, + "learning_rate": 9.634143672973644e-06, + "loss": 0.0023, + "step": 502 + }, + { + "epoch": 0.4460208379516737, + "grad_norm": 0.41590166091918945, + "learning_rate": 9.632343897456473e-06, + "loss": 0.0125, + "step": 503 + }, + { + "epoch": 0.4469075592994901, + "grad_norm": 0.27357831597328186, + "learning_rate": 9.630539874882742e-06, + "loss": 0.0054, + "step": 504 + }, + { + "epoch": 0.4477942806473066, + "grad_norm": 0.602550745010376, + "learning_rate": 9.62873160690643e-06, + "loss": 0.0069, + "step": 505 + }, + { + "epoch": 0.448681001995123, + "grad_norm": 0.5262980461120605, + "learning_rate": 9.626919095185403e-06, + "loss": 0.0041, + "step": 506 + }, + { + "epoch": 0.4495677233429395, + "grad_norm": 0.5450761914253235, + "learning_rate": 9.625102341381418e-06, + "loss": 0.0089, + "step": 507 + }, + { + "epoch": 0.4504544446907559, + "grad_norm": 0.6254885792732239, + "learning_rate": 9.623281347160129e-06, + "loss": 0.0071, + "step": 508 + }, + { + "epoch": 0.4513411660385724, + "grad_norm": 0.4506608545780182, + "learning_rate": 9.621456114191067e-06, + "loss": 0.0051, + "step": 509 + }, + { + "epoch": 0.4522278873863888, + "grad_norm": 0.663757860660553, + "learning_rate": 9.619626644147655e-06, + "loss": 0.0071, + "step": 510 + }, + { + "epoch": 0.4531146087342053, + "grad_norm": 0.5022022128105164, + "learning_rate": 9.617792938707204e-06, + "loss": 0.0098, + "step": 511 + }, + { + "epoch": 0.4540013300820217, + "grad_norm": 0.46550077199935913, + "learning_rate": 9.615954999550899e-06, + "loss": 0.0047, + "step": 512 + }, + { + "epoch": 0.4548880514298382, + "grad_norm": 0.8453051447868347, + "learning_rate": 9.614112828363814e-06, + "loss": 0.0121, + "step": 513 + }, + { + "epoch": 0.4557747727776546, + "grad_norm": 0.44652843475341797, + "learning_rate": 9.612266426834901e-06, + "loss": 0.004, + "step": 514 + }, + { + "epoch": 0.4566614941254711, + "grad_norm": 0.6860093474388123, + "learning_rate": 9.61041579665699e-06, + "loss": 0.0059, + "step": 515 + }, + { + "epoch": 0.4575482154732875, + "grad_norm": 0.7249019742012024, + "learning_rate": 9.60856093952679e-06, + "loss": 0.0057, + "step": 516 + }, + { + "epoch": 0.458434936821104, + "grad_norm": 0.5832751393318176, + "learning_rate": 9.60670185714488e-06, + "loss": 0.0079, + "step": 517 + }, + { + "epoch": 0.4593216581689204, + "grad_norm": 0.7885969281196594, + "learning_rate": 9.604838551215715e-06, + "loss": 0.0071, + "step": 518 + }, + { + "epoch": 0.4602083795167369, + "grad_norm": 0.6885710954666138, + "learning_rate": 9.602971023447632e-06, + "loss": 0.0135, + "step": 519 + }, + { + "epoch": 0.4610951008645533, + "grad_norm": 1.1160459518432617, + "learning_rate": 9.60109927555282e-06, + "loss": 0.0111, + "step": 520 + }, + { + "epoch": 0.4619818222123698, + "grad_norm": 1.2547369003295898, + "learning_rate": 9.599223309247356e-06, + "loss": 0.0192, + "step": 521 + }, + { + "epoch": 0.4628685435601862, + "grad_norm": 0.7981187701225281, + "learning_rate": 9.59734312625117e-06, + "loss": 0.0046, + "step": 522 + }, + { + "epoch": 0.4637552649080027, + "grad_norm": 0.8385836482048035, + "learning_rate": 9.595458728288068e-06, + "loss": 0.01, + "step": 523 + }, + { + "epoch": 0.4646419862558191, + "grad_norm": 0.44900402426719666, + "learning_rate": 9.593570117085714e-06, + "loss": 0.0127, + "step": 524 + }, + { + "epoch": 0.46552870760363557, + "grad_norm": 0.8929427266120911, + "learning_rate": 9.591677294375637e-06, + "loss": 0.0075, + "step": 525 + }, + { + "epoch": 0.466415428951452, + "grad_norm": 0.3609806299209595, + "learning_rate": 9.589780261893229e-06, + "loss": 0.0055, + "step": 526 + }, + { + "epoch": 0.46730215029926847, + "grad_norm": 1.0652682781219482, + "learning_rate": 9.587879021377739e-06, + "loss": 0.0136, + "step": 527 + }, + { + "epoch": 0.4681888716470849, + "grad_norm": 0.5709818601608276, + "learning_rate": 9.585973574572276e-06, + "loss": 0.0068, + "step": 528 + }, + { + "epoch": 0.46907559299490137, + "grad_norm": 0.6579173803329468, + "learning_rate": 9.584063923223804e-06, + "loss": 0.0057, + "step": 529 + }, + { + "epoch": 0.4699623143427178, + "grad_norm": 0.2939516603946686, + "learning_rate": 9.582150069083141e-06, + "loss": 0.0063, + "step": 530 + }, + { + "epoch": 0.47084903569053427, + "grad_norm": 0.4318031072616577, + "learning_rate": 9.58023201390496e-06, + "loss": 0.0048, + "step": 531 + }, + { + "epoch": 0.4717357570383507, + "grad_norm": 0.5636759996414185, + "learning_rate": 9.578309759447786e-06, + "loss": 0.0109, + "step": 532 + }, + { + "epoch": 0.47262247838616717, + "grad_norm": 0.8026735186576843, + "learning_rate": 9.576383307473995e-06, + "loss": 0.0105, + "step": 533 + }, + { + "epoch": 0.4735091997339836, + "grad_norm": 0.6912361979484558, + "learning_rate": 9.574452659749807e-06, + "loss": 0.0052, + "step": 534 + }, + { + "epoch": 0.47439592108180006, + "grad_norm": 0.6018496751785278, + "learning_rate": 9.572517818045291e-06, + "loss": 0.0186, + "step": 535 + }, + { + "epoch": 0.4752826424296165, + "grad_norm": 0.4667069911956787, + "learning_rate": 9.570578784134364e-06, + "loss": 0.0062, + "step": 536 + }, + { + "epoch": 0.47616936377743296, + "grad_norm": 0.8536850810050964, + "learning_rate": 9.568635559794782e-06, + "loss": 0.0088, + "step": 537 + }, + { + "epoch": 0.4770560851252494, + "grad_norm": 0.7802487015724182, + "learning_rate": 9.566688146808147e-06, + "loss": 0.0141, + "step": 538 + }, + { + "epoch": 0.47794280647306586, + "grad_norm": 0.5833753347396851, + "learning_rate": 9.564736546959896e-06, + "loss": 0.0085, + "step": 539 + }, + { + "epoch": 0.4788295278208823, + "grad_norm": 0.6617295742034912, + "learning_rate": 9.56278076203931e-06, + "loss": 0.0074, + "step": 540 + }, + { + "epoch": 0.47971624916869876, + "grad_norm": 1.122158408164978, + "learning_rate": 9.560820793839505e-06, + "loss": 0.011, + "step": 541 + }, + { + "epoch": 0.4806029705165152, + "grad_norm": 0.49062931537628174, + "learning_rate": 9.558856644157432e-06, + "loss": 0.0065, + "step": 542 + }, + { + "epoch": 0.48148969186433166, + "grad_norm": 0.7522017955780029, + "learning_rate": 9.556888314793875e-06, + "loss": 0.0126, + "step": 543 + }, + { + "epoch": 0.4823764132121481, + "grad_norm": 1.2089123725891113, + "learning_rate": 9.554915807553451e-06, + "loss": 0.007, + "step": 544 + }, + { + "epoch": 0.48326313455996456, + "grad_norm": 0.5262856483459473, + "learning_rate": 9.552939124244608e-06, + "loss": 0.0071, + "step": 545 + }, + { + "epoch": 0.484149855907781, + "grad_norm": 0.5715947151184082, + "learning_rate": 9.550958266679623e-06, + "loss": 0.0052, + "step": 546 + }, + { + "epoch": 0.4850365772555974, + "grad_norm": 0.6762679815292358, + "learning_rate": 9.548973236674598e-06, + "loss": 0.0089, + "step": 547 + }, + { + "epoch": 0.4859232986034139, + "grad_norm": 0.4889708161354065, + "learning_rate": 9.546984036049461e-06, + "loss": 0.0063, + "step": 548 + }, + { + "epoch": 0.4868100199512303, + "grad_norm": 0.41651228070259094, + "learning_rate": 9.544990666627964e-06, + "loss": 0.0129, + "step": 549 + }, + { + "epoch": 0.4876967412990468, + "grad_norm": 0.41512253880500793, + "learning_rate": 9.542993130237683e-06, + "loss": 0.0085, + "step": 550 + }, + { + "epoch": 0.4885834626468632, + "grad_norm": 0.5797951221466064, + "learning_rate": 9.54099142871001e-06, + "loss": 0.0089, + "step": 551 + }, + { + "epoch": 0.4894701839946797, + "grad_norm": 1.1996581554412842, + "learning_rate": 9.53898556388016e-06, + "loss": 0.0132, + "step": 552 + }, + { + "epoch": 0.4903569053424961, + "grad_norm": 0.9442786574363708, + "learning_rate": 9.536975537587167e-06, + "loss": 0.0093, + "step": 553 + }, + { + "epoch": 0.4912436266903126, + "grad_norm": 0.9179630279541016, + "learning_rate": 9.534961351673871e-06, + "loss": 0.0212, + "step": 554 + }, + { + "epoch": 0.492130348038129, + "grad_norm": 0.34069445729255676, + "learning_rate": 9.532943007986932e-06, + "loss": 0.0052, + "step": 555 + }, + { + "epoch": 0.4930170693859455, + "grad_norm": 1.1509453058242798, + "learning_rate": 9.530920508376824e-06, + "loss": 0.0125, + "step": 556 + }, + { + "epoch": 0.4939037907337619, + "grad_norm": 0.3179965019226074, + "learning_rate": 9.52889385469783e-06, + "loss": 0.0063, + "step": 557 + }, + { + "epoch": 0.49479051208157837, + "grad_norm": 0.35437366366386414, + "learning_rate": 9.526863048808035e-06, + "loss": 0.0051, + "step": 558 + }, + { + "epoch": 0.4956772334293948, + "grad_norm": 0.43903347849845886, + "learning_rate": 9.52482809256934e-06, + "loss": 0.0067, + "step": 559 + }, + { + "epoch": 0.49656395477721127, + "grad_norm": 0.5896758437156677, + "learning_rate": 9.522788987847447e-06, + "loss": 0.0101, + "step": 560 + }, + { + "epoch": 0.4974506761250277, + "grad_norm": 0.1301652044057846, + "learning_rate": 9.520745736511861e-06, + "loss": 0.0018, + "step": 561 + }, + { + "epoch": 0.49833739747284417, + "grad_norm": 1.0899876356124878, + "learning_rate": 9.518698340435886e-06, + "loss": 0.0127, + "step": 562 + }, + { + "epoch": 0.4992241188206606, + "grad_norm": 0.6057047247886658, + "learning_rate": 9.516646801496636e-06, + "loss": 0.0063, + "step": 563 + }, + { + "epoch": 0.5001108401684771, + "grad_norm": 0.6446989178657532, + "learning_rate": 9.514591121575012e-06, + "loss": 0.0059, + "step": 564 + }, + { + "epoch": 0.5001108401684771, + "eval_loss": 0.021343758329749107, + "eval_runtime": 59.8604, + "eval_samples_per_second": 3.207, + "eval_steps_per_second": 0.802, + "step": 564 + }, + { + "epoch": 0.5009975615162935, + "grad_norm": 1.243457555770874, + "learning_rate": 9.512531302555717e-06, + "loss": 0.015, + "step": 565 + }, + { + "epoch": 0.5018842828641099, + "grad_norm": 0.9451794028282166, + "learning_rate": 9.51046734632725e-06, + "loss": 0.013, + "step": 566 + }, + { + "epoch": 0.5027710042119264, + "grad_norm": 0.5004759430885315, + "learning_rate": 9.508399254781899e-06, + "loss": 0.0091, + "step": 567 + }, + { + "epoch": 0.5036577255597429, + "grad_norm": 0.5445528626441956, + "learning_rate": 9.506327029815746e-06, + "loss": 0.0099, + "step": 568 + }, + { + "epoch": 0.5045444469075593, + "grad_norm": 0.5810771584510803, + "learning_rate": 9.504250673328661e-06, + "loss": 0.0123, + "step": 569 + }, + { + "epoch": 0.5054311682553757, + "grad_norm": 0.7572499513626099, + "learning_rate": 9.502170187224307e-06, + "loss": 0.008, + "step": 570 + }, + { + "epoch": 0.5063178896031922, + "grad_norm": 0.4947316348552704, + "learning_rate": 9.500085573410126e-06, + "loss": 0.0038, + "step": 571 + }, + { + "epoch": 0.5072046109510087, + "grad_norm": 0.2767347991466522, + "learning_rate": 9.497996833797348e-06, + "loss": 0.0033, + "step": 572 + }, + { + "epoch": 0.5080913322988251, + "grad_norm": 0.7010060548782349, + "learning_rate": 9.495903970300986e-06, + "loss": 0.0101, + "step": 573 + }, + { + "epoch": 0.5089780536466415, + "grad_norm": 0.7284058332443237, + "learning_rate": 9.493806984839833e-06, + "loss": 0.0098, + "step": 574 + }, + { + "epoch": 0.509864774994458, + "grad_norm": 0.42266321182250977, + "learning_rate": 9.49170587933646e-06, + "loss": 0.0048, + "step": 575 + }, + { + "epoch": 0.5107514963422745, + "grad_norm": 0.40918147563934326, + "learning_rate": 9.489600655717217e-06, + "loss": 0.0071, + "step": 576 + }, + { + "epoch": 0.5116382176900909, + "grad_norm": 0.3350435793399811, + "learning_rate": 9.487491315912231e-06, + "loss": 0.0118, + "step": 577 + }, + { + "epoch": 0.5125249390379073, + "grad_norm": 0.4555787146091461, + "learning_rate": 9.485377861855398e-06, + "loss": 0.0057, + "step": 578 + }, + { + "epoch": 0.5134116603857238, + "grad_norm": 0.9364067912101746, + "learning_rate": 9.483260295484393e-06, + "loss": 0.0119, + "step": 579 + }, + { + "epoch": 0.5142983817335403, + "grad_norm": 1.019731879234314, + "learning_rate": 9.481138618740655e-06, + "loss": 0.0143, + "step": 580 + }, + { + "epoch": 0.5151851030813567, + "grad_norm": 0.4347856342792511, + "learning_rate": 9.479012833569394e-06, + "loss": 0.0046, + "step": 581 + }, + { + "epoch": 0.5160718244291731, + "grad_norm": 0.34786656498908997, + "learning_rate": 9.476882941919587e-06, + "loss": 0.0068, + "step": 582 + }, + { + "epoch": 0.5169585457769896, + "grad_norm": 0.9139060378074646, + "learning_rate": 9.474748945743974e-06, + "loss": 0.0094, + "step": 583 + }, + { + "epoch": 0.517845267124806, + "grad_norm": 0.23097054660320282, + "learning_rate": 9.472610846999061e-06, + "loss": 0.0021, + "step": 584 + }, + { + "epoch": 0.5187319884726225, + "grad_norm": 0.5011057257652283, + "learning_rate": 9.470468647645116e-06, + "loss": 0.0054, + "step": 585 + }, + { + "epoch": 0.5196187098204389, + "grad_norm": 0.5539005398750305, + "learning_rate": 9.468322349646162e-06, + "loss": 0.0113, + "step": 586 + }, + { + "epoch": 0.5205054311682554, + "grad_norm": 0.5280828475952148, + "learning_rate": 9.466171954969981e-06, + "loss": 0.0088, + "step": 587 + }, + { + "epoch": 0.5213921525160718, + "grad_norm": 0.7050261497497559, + "learning_rate": 9.464017465588115e-06, + "loss": 0.0099, + "step": 588 + }, + { + "epoch": 0.5222788738638883, + "grad_norm": 0.2560153007507324, + "learning_rate": 9.461858883475859e-06, + "loss": 0.0018, + "step": 589 + }, + { + "epoch": 0.5231655952117047, + "grad_norm": 0.5633629560470581, + "learning_rate": 9.459696210612253e-06, + "loss": 0.0105, + "step": 590 + }, + { + "epoch": 0.5240523165595212, + "grad_norm": 0.7544528841972351, + "learning_rate": 9.457529448980098e-06, + "loss": 0.0102, + "step": 591 + }, + { + "epoch": 0.5249390379073376, + "grad_norm": 0.36814677715301514, + "learning_rate": 9.455358600565936e-06, + "loss": 0.0059, + "step": 592 + }, + { + "epoch": 0.5258257592551541, + "grad_norm": 0.8715342283248901, + "learning_rate": 9.453183667360062e-06, + "loss": 0.0124, + "step": 593 + }, + { + "epoch": 0.5267124806029705, + "grad_norm": 0.36388614773750305, + "learning_rate": 9.451004651356511e-06, + "loss": 0.0033, + "step": 594 + }, + { + "epoch": 0.5275992019507869, + "grad_norm": 0.6503794193267822, + "learning_rate": 9.448821554553061e-06, + "loss": 0.007, + "step": 595 + }, + { + "epoch": 0.5284859232986034, + "grad_norm": 1.0000611543655396, + "learning_rate": 9.446634378951236e-06, + "loss": 0.0137, + "step": 596 + }, + { + "epoch": 0.5293726446464199, + "grad_norm": 0.4502328634262085, + "learning_rate": 9.444443126556297e-06, + "loss": 0.0058, + "step": 597 + }, + { + "epoch": 0.5302593659942363, + "grad_norm": 0.5919182300567627, + "learning_rate": 9.442247799377242e-06, + "loss": 0.0066, + "step": 598 + }, + { + "epoch": 0.5311460873420527, + "grad_norm": 0.4766238033771515, + "learning_rate": 9.440048399426805e-06, + "loss": 0.0048, + "step": 599 + }, + { + "epoch": 0.5320328086898692, + "grad_norm": 0.26894569396972656, + "learning_rate": 9.437844928721455e-06, + "loss": 0.007, + "step": 600 + }, + { + "epoch": 0.5329195300376857, + "grad_norm": 1.3799517154693604, + "learning_rate": 9.435637389281395e-06, + "loss": 0.0072, + "step": 601 + }, + { + "epoch": 0.5338062513855021, + "grad_norm": 0.3927518129348755, + "learning_rate": 9.433425783130554e-06, + "loss": 0.0024, + "step": 602 + }, + { + "epoch": 0.5346929727333185, + "grad_norm": 0.2583082616329193, + "learning_rate": 9.43121011229659e-06, + "loss": 0.0032, + "step": 603 + }, + { + "epoch": 0.535579694081135, + "grad_norm": 0.6682037115097046, + "learning_rate": 9.428990378810891e-06, + "loss": 0.0076, + "step": 604 + }, + { + "epoch": 0.5364664154289515, + "grad_norm": 1.4830431938171387, + "learning_rate": 9.42676658470857e-06, + "loss": 0.0072, + "step": 605 + }, + { + "epoch": 0.5373531367767679, + "grad_norm": 0.5184667706489563, + "learning_rate": 9.424538732028457e-06, + "loss": 0.0058, + "step": 606 + }, + { + "epoch": 0.5382398581245843, + "grad_norm": 0.9888753294944763, + "learning_rate": 9.422306822813108e-06, + "loss": 0.0104, + "step": 607 + }, + { + "epoch": 0.5391265794724008, + "grad_norm": 0.4412868916988373, + "learning_rate": 9.420070859108799e-06, + "loss": 0.0045, + "step": 608 + }, + { + "epoch": 0.5400133008202173, + "grad_norm": 0.5253145694732666, + "learning_rate": 9.417830842965519e-06, + "loss": 0.0064, + "step": 609 + }, + { + "epoch": 0.5409000221680337, + "grad_norm": 0.2791012227535248, + "learning_rate": 9.415586776436973e-06, + "loss": 0.0026, + "step": 610 + }, + { + "epoch": 0.5417867435158501, + "grad_norm": 1.9227204322814941, + "learning_rate": 9.413338661580587e-06, + "loss": 0.0086, + "step": 611 + }, + { + "epoch": 0.5426734648636666, + "grad_norm": 0.3563939332962036, + "learning_rate": 9.411086500457486e-06, + "loss": 0.0032, + "step": 612 + }, + { + "epoch": 0.543560186211483, + "grad_norm": 0.4303697645664215, + "learning_rate": 9.408830295132516e-06, + "loss": 0.0033, + "step": 613 + }, + { + "epoch": 0.5444469075592995, + "grad_norm": 0.502570629119873, + "learning_rate": 9.406570047674223e-06, + "loss": 0.0036, + "step": 614 + }, + { + "epoch": 0.5453336289071159, + "grad_norm": 0.7179825305938721, + "learning_rate": 9.404305760154866e-06, + "loss": 0.0112, + "step": 615 + }, + { + "epoch": 0.5462203502549324, + "grad_norm": 1.023728847503662, + "learning_rate": 9.402037434650402e-06, + "loss": 0.0149, + "step": 616 + }, + { + "epoch": 0.5471070716027489, + "grad_norm": 1.108109951019287, + "learning_rate": 9.399765073240491e-06, + "loss": 0.0105, + "step": 617 + }, + { + "epoch": 0.5479937929505653, + "grad_norm": 0.9911279082298279, + "learning_rate": 9.397488678008498e-06, + "loss": 0.0014, + "step": 618 + }, + { + "epoch": 0.5488805142983817, + "grad_norm": 0.9575443863868713, + "learning_rate": 9.395208251041482e-06, + "loss": 0.0132, + "step": 619 + }, + { + "epoch": 0.5497672356461982, + "grad_norm": 0.9429709315299988, + "learning_rate": 9.392923794430196e-06, + "loss": 0.0119, + "step": 620 + }, + { + "epoch": 0.5506539569940146, + "grad_norm": 0.7063897848129272, + "learning_rate": 9.390635310269094e-06, + "loss": 0.0094, + "step": 621 + }, + { + "epoch": 0.5515406783418311, + "grad_norm": 0.5705755352973938, + "learning_rate": 9.388342800656319e-06, + "loss": 0.009, + "step": 622 + }, + { + "epoch": 0.5524273996896475, + "grad_norm": 0.67104572057724, + "learning_rate": 9.386046267693705e-06, + "loss": 0.0069, + "step": 623 + }, + { + "epoch": 0.553314121037464, + "grad_norm": 0.44542428851127625, + "learning_rate": 9.383745713486774e-06, + "loss": 0.0095, + "step": 624 + }, + { + "epoch": 0.5542008423852804, + "grad_norm": 0.3004834055900574, + "learning_rate": 9.381441140144735e-06, + "loss": 0.004, + "step": 625 + }, + { + "epoch": 0.5550875637330969, + "grad_norm": 0.48007500171661377, + "learning_rate": 9.379132549780486e-06, + "loss": 0.008, + "step": 626 + }, + { + "epoch": 0.5559742850809133, + "grad_norm": 0.8524070382118225, + "learning_rate": 9.376819944510598e-06, + "loss": 0.0128, + "step": 627 + }, + { + "epoch": 0.5568610064287298, + "grad_norm": 1.298292875289917, + "learning_rate": 9.374503326455335e-06, + "loss": 0.0154, + "step": 628 + }, + { + "epoch": 0.5577477277765462, + "grad_norm": 0.2932794690132141, + "learning_rate": 9.37218269773863e-06, + "loss": 0.0027, + "step": 629 + }, + { + "epoch": 0.5586344491243627, + "grad_norm": 0.5051419138908386, + "learning_rate": 9.369858060488102e-06, + "loss": 0.0077, + "step": 630 + }, + { + "epoch": 0.5595211704721791, + "grad_norm": 0.48923778533935547, + "learning_rate": 9.367529416835038e-06, + "loss": 0.0108, + "step": 631 + }, + { + "epoch": 0.5604078918199956, + "grad_norm": 0.7815966606140137, + "learning_rate": 9.365196768914399e-06, + "loss": 0.0146, + "step": 632 + }, + { + "epoch": 0.561294613167812, + "grad_norm": 0.3096315562725067, + "learning_rate": 9.362860118864822e-06, + "loss": 0.005, + "step": 633 + }, + { + "epoch": 0.5621813345156285, + "grad_norm": 0.6307308077812195, + "learning_rate": 9.360519468828608e-06, + "loss": 0.0101, + "step": 634 + }, + { + "epoch": 0.5630680558634449, + "grad_norm": 0.5058624744415283, + "learning_rate": 9.358174820951727e-06, + "loss": 0.0071, + "step": 635 + }, + { + "epoch": 0.5639547772112614, + "grad_norm": 0.35703045129776, + "learning_rate": 9.355826177383818e-06, + "loss": 0.0054, + "step": 636 + }, + { + "epoch": 0.5648414985590778, + "grad_norm": 0.9188088178634644, + "learning_rate": 9.353473540278175e-06, + "loss": 0.0133, + "step": 637 + }, + { + "epoch": 0.5657282199068943, + "grad_norm": 0.7231817245483398, + "learning_rate": 9.351116911791765e-06, + "loss": 0.0061, + "step": 638 + }, + { + "epoch": 0.5666149412547107, + "grad_norm": 0.551129937171936, + "learning_rate": 9.348756294085202e-06, + "loss": 0.0045, + "step": 639 + }, + { + "epoch": 0.5675016626025271, + "grad_norm": 1.0952398777008057, + "learning_rate": 9.346391689322768e-06, + "loss": 0.0096, + "step": 640 + }, + { + "epoch": 0.5683883839503436, + "grad_norm": 2.5634279251098633, + "learning_rate": 9.344023099672392e-06, + "loss": 0.012, + "step": 641 + }, + { + "epoch": 0.5692751052981601, + "grad_norm": 0.5959585309028625, + "learning_rate": 9.34165052730566e-06, + "loss": 0.0038, + "step": 642 + }, + { + "epoch": 0.5701618266459765, + "grad_norm": 0.8079138398170471, + "learning_rate": 9.339273974397814e-06, + "loss": 0.0057, + "step": 643 + }, + { + "epoch": 0.5710485479937929, + "grad_norm": 1.0635994672775269, + "learning_rate": 9.336893443127739e-06, + "loss": 0.0041, + "step": 644 + }, + { + "epoch": 0.5719352693416094, + "grad_norm": 0.2653323709964752, + "learning_rate": 9.334508935677968e-06, + "loss": 0.0016, + "step": 645 + }, + { + "epoch": 0.5728219906894259, + "grad_norm": 0.32824158668518066, + "learning_rate": 9.332120454234682e-06, + "loss": 0.0032, + "step": 646 + }, + { + "epoch": 0.5737087120372423, + "grad_norm": 0.31941306591033936, + "learning_rate": 9.329728000987706e-06, + "loss": 0.0023, + "step": 647 + }, + { + "epoch": 0.5745954333850587, + "grad_norm": 0.3932683765888214, + "learning_rate": 9.327331578130503e-06, + "loss": 0.0021, + "step": 648 + }, + { + "epoch": 0.5754821547328752, + "grad_norm": 1.0215529203414917, + "learning_rate": 9.324931187860179e-06, + "loss": 0.01, + "step": 649 + }, + { + "epoch": 0.5763688760806917, + "grad_norm": 1.4922890663146973, + "learning_rate": 9.322526832377473e-06, + "loss": 0.0035, + "step": 650 + }, + { + "epoch": 0.5772555974285081, + "grad_norm": 0.16321395337581635, + "learning_rate": 9.32011851388677e-06, + "loss": 0.001, + "step": 651 + }, + { + "epoch": 0.5781423187763245, + "grad_norm": 0.8923147320747375, + "learning_rate": 9.31770623459607e-06, + "loss": 0.0041, + "step": 652 + }, + { + "epoch": 0.579029040124141, + "grad_norm": 2.212514877319336, + "learning_rate": 9.315289996717022e-06, + "loss": 0.0159, + "step": 653 + }, + { + "epoch": 0.5799157614719574, + "grad_norm": 0.5469567775726318, + "learning_rate": 9.312869802464896e-06, + "loss": 0.0026, + "step": 654 + }, + { + "epoch": 0.5808024828197739, + "grad_norm": 0.2924470901489258, + "learning_rate": 9.310445654058589e-06, + "loss": 0.0016, + "step": 655 + }, + { + "epoch": 0.5816892041675903, + "grad_norm": 0.450457364320755, + "learning_rate": 9.308017553720628e-06, + "loss": 0.005, + "step": 656 + }, + { + "epoch": 0.5825759255154068, + "grad_norm": 0.2175384908914566, + "learning_rate": 9.305585503677158e-06, + "loss": 0.0011, + "step": 657 + }, + { + "epoch": 0.5834626468632232, + "grad_norm": 1.5817656517028809, + "learning_rate": 9.303149506157948e-06, + "loss": 0.0081, + "step": 658 + }, + { + "epoch": 0.5843493682110397, + "grad_norm": 0.8667041063308716, + "learning_rate": 9.300709563396386e-06, + "loss": 0.0096, + "step": 659 + }, + { + "epoch": 0.5852360895588561, + "grad_norm": 0.665524423122406, + "learning_rate": 9.298265677629476e-06, + "loss": 0.0045, + "step": 660 + }, + { + "epoch": 0.5861228109066726, + "grad_norm": 0.8336466550827026, + "learning_rate": 9.295817851097836e-06, + "loss": 0.0079, + "step": 661 + }, + { + "epoch": 0.587009532254489, + "grad_norm": 0.8972154259681702, + "learning_rate": 9.293366086045703e-06, + "loss": 0.0069, + "step": 662 + }, + { + "epoch": 0.5878962536023055, + "grad_norm": 0.594464123249054, + "learning_rate": 9.290910384720918e-06, + "loss": 0.0127, + "step": 663 + }, + { + "epoch": 0.5887829749501219, + "grad_norm": 0.5330869555473328, + "learning_rate": 9.288450749374933e-06, + "loss": 0.0073, + "step": 664 + }, + { + "epoch": 0.5896696962979384, + "grad_norm": 0.673941433429718, + "learning_rate": 9.28598718226281e-06, + "loss": 0.0055, + "step": 665 + }, + { + "epoch": 0.5905564176457548, + "grad_norm": 0.4206015169620514, + "learning_rate": 9.28351968564321e-06, + "loss": 0.0045, + "step": 666 + }, + { + "epoch": 0.5914431389935713, + "grad_norm": 0.4695059657096863, + "learning_rate": 9.2810482617784e-06, + "loss": 0.007, + "step": 667 + }, + { + "epoch": 0.5923298603413877, + "grad_norm": 0.5791723728179932, + "learning_rate": 9.27857291293425e-06, + "loss": 0.0082, + "step": 668 + }, + { + "epoch": 0.5932165816892042, + "grad_norm": 0.4505319595336914, + "learning_rate": 9.276093641380224e-06, + "loss": 0.007, + "step": 669 + }, + { + "epoch": 0.5941033030370206, + "grad_norm": 0.38367578387260437, + "learning_rate": 9.273610449389384e-06, + "loss": 0.0061, + "step": 670 + }, + { + "epoch": 0.5949900243848371, + "grad_norm": 0.2096157670021057, + "learning_rate": 9.271123339238387e-06, + "loss": 0.0045, + "step": 671 + }, + { + "epoch": 0.5958767457326535, + "grad_norm": 0.2905729115009308, + "learning_rate": 9.268632313207484e-06, + "loss": 0.009, + "step": 672 + }, + { + "epoch": 0.59676346708047, + "grad_norm": 0.40436604619026184, + "learning_rate": 9.266137373580512e-06, + "loss": 0.0057, + "step": 673 + }, + { + "epoch": 0.5976501884282864, + "grad_norm": 0.6021425724029541, + "learning_rate": 9.263638522644898e-06, + "loss": 0.0095, + "step": 674 + }, + { + "epoch": 0.5985369097761029, + "grad_norm": 0.39547786116600037, + "learning_rate": 9.261135762691658e-06, + "loss": 0.0073, + "step": 675 + }, + { + "epoch": 0.5994236311239193, + "grad_norm": 0.4780794382095337, + "learning_rate": 9.258629096015385e-06, + "loss": 0.0098, + "step": 676 + }, + { + "epoch": 0.6003103524717358, + "grad_norm": 0.5975167751312256, + "learning_rate": 9.256118524914263e-06, + "loss": 0.0146, + "step": 677 + }, + { + "epoch": 0.6011970738195522, + "grad_norm": 0.6294581890106201, + "learning_rate": 9.253604051690047e-06, + "loss": 0.0063, + "step": 678 + }, + { + "epoch": 0.6020837951673687, + "grad_norm": 0.28941860795021057, + "learning_rate": 9.251085678648072e-06, + "loss": 0.0034, + "step": 679 + }, + { + "epoch": 0.6029705165151851, + "grad_norm": 0.46086496114730835, + "learning_rate": 9.248563408097252e-06, + "loss": 0.0045, + "step": 680 + }, + { + "epoch": 0.6038572378630016, + "grad_norm": 0.2607171833515167, + "learning_rate": 9.246037242350072e-06, + "loss": 0.0035, + "step": 681 + }, + { + "epoch": 0.604743959210818, + "grad_norm": 0.5795825123786926, + "learning_rate": 9.24350718372259e-06, + "loss": 0.0069, + "step": 682 + }, + { + "epoch": 0.6056306805586344, + "grad_norm": 0.5818445086479187, + "learning_rate": 9.240973234534426e-06, + "loss": 0.0096, + "step": 683 + }, + { + "epoch": 0.6065174019064509, + "grad_norm": 0.3651141822338104, + "learning_rate": 9.238435397108777e-06, + "loss": 0.0061, + "step": 684 + }, + { + "epoch": 0.6074041232542673, + "grad_norm": 0.5181947350502014, + "learning_rate": 9.235893673772397e-06, + "loss": 0.0123, + "step": 685 + }, + { + "epoch": 0.6082908446020838, + "grad_norm": 0.4905543923377991, + "learning_rate": 9.233348066855607e-06, + "loss": 0.0031, + "step": 686 + }, + { + "epoch": 0.6091775659499002, + "grad_norm": 1.2415827512741089, + "learning_rate": 9.23079857869229e-06, + "loss": 0.004, + "step": 687 + }, + { + "epoch": 0.6100642872977167, + "grad_norm": 0.5994253158569336, + "learning_rate": 9.228245211619881e-06, + "loss": 0.0076, + "step": 688 + }, + { + "epoch": 0.6109510086455331, + "grad_norm": 0.2408173829317093, + "learning_rate": 9.225687967979377e-06, + "loss": 0.0029, + "step": 689 + }, + { + "epoch": 0.6118377299933496, + "grad_norm": 0.7248222827911377, + "learning_rate": 9.223126850115327e-06, + "loss": 0.0074, + "step": 690 + }, + { + "epoch": 0.612724451341166, + "grad_norm": 1.5161305665969849, + "learning_rate": 9.220561860375831e-06, + "loss": 0.0124, + "step": 691 + }, + { + "epoch": 0.6136111726889825, + "grad_norm": 0.6087315678596497, + "learning_rate": 9.217993001112544e-06, + "loss": 0.0065, + "step": 692 + }, + { + "epoch": 0.6144978940367989, + "grad_norm": 0.4666323661804199, + "learning_rate": 9.215420274680658e-06, + "loss": 0.0035, + "step": 693 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.32539889216423035, + "learning_rate": 9.212843683438923e-06, + "loss": 0.0045, + "step": 694 + }, + { + "epoch": 0.6162713367324318, + "grad_norm": 0.44837233424186707, + "learning_rate": 9.210263229749626e-06, + "loss": 0.0073, + "step": 695 + }, + { + "epoch": 0.6171580580802483, + "grad_norm": 0.7816086411476135, + "learning_rate": 9.207678915978593e-06, + "loss": 0.007, + "step": 696 + }, + { + "epoch": 0.6180447794280647, + "grad_norm": 0.4992975890636444, + "learning_rate": 9.205090744495193e-06, + "loss": 0.0066, + "step": 697 + }, + { + "epoch": 0.6189315007758812, + "grad_norm": 0.5404097437858582, + "learning_rate": 9.202498717672332e-06, + "loss": 0.0058, + "step": 698 + }, + { + "epoch": 0.6198182221236976, + "grad_norm": 0.4112424850463867, + "learning_rate": 9.19990283788645e-06, + "loss": 0.0055, + "step": 699 + }, + { + "epoch": 0.6207049434715141, + "grad_norm": 0.5358926653862, + "learning_rate": 9.197303107517516e-06, + "loss": 0.0063, + "step": 700 + }, + { + "epoch": 0.6215916648193305, + "grad_norm": 0.4793187975883484, + "learning_rate": 9.194699528949034e-06, + "loss": 0.005, + "step": 701 + }, + { + "epoch": 0.622478386167147, + "grad_norm": 0.4965543746948242, + "learning_rate": 9.192092104568033e-06, + "loss": 0.0065, + "step": 702 + }, + { + "epoch": 0.6233651075149634, + "grad_norm": 0.31342917680740356, + "learning_rate": 9.189480836765071e-06, + "loss": 0.0048, + "step": 703 + }, + { + "epoch": 0.6242518288627799, + "grad_norm": 0.8319897651672363, + "learning_rate": 9.186865727934227e-06, + "loss": 0.0062, + "step": 704 + }, + { + "epoch": 0.6251385502105963, + "grad_norm": 0.6753369569778442, + "learning_rate": 9.184246780473105e-06, + "loss": 0.0092, + "step": 705 + }, + { + "epoch": 0.6260252715584128, + "grad_norm": 0.7414891719818115, + "learning_rate": 9.181623996782822e-06, + "loss": 0.0129, + "step": 706 + }, + { + "epoch": 0.6269119929062292, + "grad_norm": 0.48528799414634705, + "learning_rate": 9.178997379268018e-06, + "loss": 0.0049, + "step": 707 + }, + { + "epoch": 0.6277987142540457, + "grad_norm": 0.3304223418235779, + "learning_rate": 9.176366930336846e-06, + "loss": 0.0074, + "step": 708 + }, + { + "epoch": 0.6286854356018621, + "grad_norm": 0.19392916560173035, + "learning_rate": 9.173732652400972e-06, + "loss": 0.0031, + "step": 709 + }, + { + "epoch": 0.6295721569496786, + "grad_norm": 0.641016960144043, + "learning_rate": 9.171094547875574e-06, + "loss": 0.008, + "step": 710 + }, + { + "epoch": 0.630458878297495, + "grad_norm": 0.3769802451133728, + "learning_rate": 9.168452619179334e-06, + "loss": 0.0039, + "step": 711 + }, + { + "epoch": 0.6313455996453115, + "grad_norm": 0.6935088038444519, + "learning_rate": 9.165806868734444e-06, + "loss": 0.0057, + "step": 712 + }, + { + "epoch": 0.6322323209931279, + "grad_norm": 0.44623953104019165, + "learning_rate": 9.163157298966596e-06, + "loss": 0.0053, + "step": 713 + }, + { + "epoch": 0.6331190423409444, + "grad_norm": 0.2738344669342041, + "learning_rate": 9.16050391230499e-06, + "loss": 0.0019, + "step": 714 + }, + { + "epoch": 0.6340057636887608, + "grad_norm": 0.6001261472702026, + "learning_rate": 9.157846711182322e-06, + "loss": 0.0059, + "step": 715 + }, + { + "epoch": 0.6348924850365772, + "grad_norm": 0.6489748954772949, + "learning_rate": 9.15518569803478e-06, + "loss": 0.0094, + "step": 716 + }, + { + "epoch": 0.6357792063843937, + "grad_norm": 0.14771291613578796, + "learning_rate": 9.152520875302058e-06, + "loss": 0.0009, + "step": 717 + }, + { + "epoch": 0.6366659277322102, + "grad_norm": 1.016381025314331, + "learning_rate": 9.149852245427333e-06, + "loss": 0.0251, + "step": 718 + }, + { + "epoch": 0.6375526490800266, + "grad_norm": 0.5366118550300598, + "learning_rate": 9.147179810857277e-06, + "loss": 0.0039, + "step": 719 + }, + { + "epoch": 0.638439370427843, + "grad_norm": 0.6103517413139343, + "learning_rate": 9.144503574042047e-06, + "loss": 0.0071, + "step": 720 + }, + { + "epoch": 0.6393260917756595, + "grad_norm": 0.5155932903289795, + "learning_rate": 9.141823537435294e-06, + "loss": 0.0066, + "step": 721 + }, + { + "epoch": 0.640212813123476, + "grad_norm": 1.691421627998352, + "learning_rate": 9.139139703494141e-06, + "loss": 0.0066, + "step": 722 + }, + { + "epoch": 0.6410995344712924, + "grad_norm": 0.6509506702423096, + "learning_rate": 9.136452074679204e-06, + "loss": 0.0078, + "step": 723 + }, + { + "epoch": 0.6419862558191088, + "grad_norm": 0.38033562898635864, + "learning_rate": 9.133760653454568e-06, + "loss": 0.0058, + "step": 724 + }, + { + "epoch": 0.6428729771669253, + "grad_norm": 0.7521248459815979, + "learning_rate": 9.131065442287803e-06, + "loss": 0.0102, + "step": 725 + }, + { + "epoch": 0.6437596985147418, + "grad_norm": 0.6125398278236389, + "learning_rate": 9.12836644364995e-06, + "loss": 0.0059, + "step": 726 + }, + { + "epoch": 0.6446464198625582, + "grad_norm": 0.5620765686035156, + "learning_rate": 9.125663660015523e-06, + "loss": 0.0115, + "step": 727 + }, + { + "epoch": 0.6455331412103746, + "grad_norm": 0.38775548338890076, + "learning_rate": 9.122957093862504e-06, + "loss": 0.0042, + "step": 728 + }, + { + "epoch": 0.6464198625581911, + "grad_norm": 0.60417240858078, + "learning_rate": 9.120246747672347e-06, + "loss": 0.0091, + "step": 729 + }, + { + "epoch": 0.6473065839060075, + "grad_norm": 0.2996869385242462, + "learning_rate": 9.117532623929969e-06, + "loss": 0.0041, + "step": 730 + }, + { + "epoch": 0.648193305253824, + "grad_norm": 0.5002415180206299, + "learning_rate": 9.114814725123755e-06, + "loss": 0.0096, + "step": 731 + }, + { + "epoch": 0.6490800266016404, + "grad_norm": 0.34925490617752075, + "learning_rate": 9.112093053745541e-06, + "loss": 0.0086, + "step": 732 + }, + { + "epoch": 0.6499667479494569, + "grad_norm": 0.3351205885410309, + "learning_rate": 9.109367612290633e-06, + "loss": 0.0033, + "step": 733 + }, + { + "epoch": 0.6508534692972733, + "grad_norm": 0.7079493999481201, + "learning_rate": 9.106638403257786e-06, + "loss": 0.0115, + "step": 734 + }, + { + "epoch": 0.6517401906450898, + "grad_norm": 0.6996093392372131, + "learning_rate": 9.103905429149212e-06, + "loss": 0.0066, + "step": 735 + }, + { + "epoch": 0.6526269119929062, + "grad_norm": 0.8951324820518494, + "learning_rate": 9.101168692470574e-06, + "loss": 0.01, + "step": 736 + }, + { + "epoch": 0.6535136333407227, + "grad_norm": 0.5505021810531616, + "learning_rate": 9.098428195730987e-06, + "loss": 0.0062, + "step": 737 + }, + { + "epoch": 0.6544003546885391, + "grad_norm": 0.3233986496925354, + "learning_rate": 9.09568394144301e-06, + "loss": 0.0044, + "step": 738 + }, + { + "epoch": 0.6552870760363556, + "grad_norm": 0.3201133608818054, + "learning_rate": 9.092935932122648e-06, + "loss": 0.0045, + "step": 739 + }, + { + "epoch": 0.656173797384172, + "grad_norm": 0.7146524786949158, + "learning_rate": 9.090184170289351e-06, + "loss": 0.0066, + "step": 740 + }, + { + "epoch": 0.6570605187319885, + "grad_norm": 0.5312227606773376, + "learning_rate": 9.087428658466009e-06, + "loss": 0.0036, + "step": 741 + }, + { + "epoch": 0.6579472400798049, + "grad_norm": 0.8712502121925354, + "learning_rate": 9.084669399178945e-06, + "loss": 0.0074, + "step": 742 + }, + { + "epoch": 0.6588339614276214, + "grad_norm": 1.0163110494613647, + "learning_rate": 9.081906394957924e-06, + "loss": 0.0136, + "step": 743 + }, + { + "epoch": 0.6597206827754378, + "grad_norm": 0.27555593848228455, + "learning_rate": 9.079139648336142e-06, + "loss": 0.0013, + "step": 744 + }, + { + "epoch": 0.6606074041232542, + "grad_norm": 0.892341136932373, + "learning_rate": 9.076369161850227e-06, + "loss": 0.009, + "step": 745 + }, + { + "epoch": 0.6614941254710707, + "grad_norm": 0.9638225436210632, + "learning_rate": 9.073594938040231e-06, + "loss": 0.0083, + "step": 746 + }, + { + "epoch": 0.6623808468188872, + "grad_norm": 0.9928414821624756, + "learning_rate": 9.070816979449642e-06, + "loss": 0.0113, + "step": 747 + }, + { + "epoch": 0.6632675681667036, + "grad_norm": 0.5321499705314636, + "learning_rate": 9.068035288625363e-06, + "loss": 0.0094, + "step": 748 + }, + { + "epoch": 0.66415428951452, + "grad_norm": 0.3631543517112732, + "learning_rate": 9.065249868117723e-06, + "loss": 0.0042, + "step": 749 + }, + { + "epoch": 0.6650410108623365, + "grad_norm": 1.0786023139953613, + "learning_rate": 9.06246072048047e-06, + "loss": 0.007, + "step": 750 + }, + { + "epoch": 0.665927732210153, + "grad_norm": 0.5387954115867615, + "learning_rate": 9.05966784827077e-06, + "loss": 0.0068, + "step": 751 + }, + { + "epoch": 0.6668144535579694, + "grad_norm": 0.14312922954559326, + "learning_rate": 9.0568712540492e-06, + "loss": 0.0016, + "step": 752 + }, + { + "epoch": 0.6677011749057858, + "grad_norm": 0.19700045883655548, + "learning_rate": 9.054070940379756e-06, + "loss": 0.0022, + "step": 753 + }, + { + "epoch": 0.6685878962536023, + "grad_norm": 0.24065862596035004, + "learning_rate": 9.051266909829838e-06, + "loss": 0.0027, + "step": 754 + }, + { + "epoch": 0.6694746176014188, + "grad_norm": 0.7865183353424072, + "learning_rate": 9.048459164970255e-06, + "loss": 0.0125, + "step": 755 + }, + { + "epoch": 0.6703613389492352, + "grad_norm": 0.4594501554965973, + "learning_rate": 9.045647708375226e-06, + "loss": 0.0071, + "step": 756 + }, + { + "epoch": 0.6712480602970516, + "grad_norm": 0.41208380460739136, + "learning_rate": 9.042832542622369e-06, + "loss": 0.0044, + "step": 757 + }, + { + "epoch": 0.6721347816448681, + "grad_norm": 0.49289634823799133, + "learning_rate": 9.0400136702927e-06, + "loss": 0.0101, + "step": 758 + }, + { + "epoch": 0.6730215029926846, + "grad_norm": 0.605767011642456, + "learning_rate": 9.037191093970636e-06, + "loss": 0.0078, + "step": 759 + }, + { + "epoch": 0.673908224340501, + "grad_norm": 0.5509247779846191, + "learning_rate": 9.034364816243995e-06, + "loss": 0.0053, + "step": 760 + }, + { + "epoch": 0.6747949456883174, + "grad_norm": 0.36483317613601685, + "learning_rate": 9.03153483970398e-06, + "loss": 0.0053, + "step": 761 + }, + { + "epoch": 0.6756816670361339, + "grad_norm": 0.7200064063072205, + "learning_rate": 9.028701166945191e-06, + "loss": 0.0059, + "step": 762 + }, + { + "epoch": 0.6765683883839504, + "grad_norm": 0.3441831171512604, + "learning_rate": 9.025863800565614e-06, + "loss": 0.0062, + "step": 763 + }, + { + "epoch": 0.6774551097317668, + "grad_norm": 0.3961979150772095, + "learning_rate": 9.023022743166623e-06, + "loss": 0.0034, + "step": 764 + }, + { + "epoch": 0.6783418310795832, + "grad_norm": 0.16061465442180634, + "learning_rate": 9.020177997352973e-06, + "loss": 0.0013, + "step": 765 + }, + { + "epoch": 0.6792285524273997, + "grad_norm": 0.8293284773826599, + "learning_rate": 9.017329565732806e-06, + "loss": 0.0085, + "step": 766 + }, + { + "epoch": 0.6801152737752162, + "grad_norm": 0.32432472705841064, + "learning_rate": 9.014477450917637e-06, + "loss": 0.0029, + "step": 767 + }, + { + "epoch": 0.6810019951230326, + "grad_norm": 0.48310521245002747, + "learning_rate": 9.011621655522365e-06, + "loss": 0.0046, + "step": 768 + }, + { + "epoch": 0.681888716470849, + "grad_norm": 0.5245798230171204, + "learning_rate": 9.008762182165256e-06, + "loss": 0.0072, + "step": 769 + }, + { + "epoch": 0.6827754378186655, + "grad_norm": 0.42478105425834656, + "learning_rate": 9.005899033467952e-06, + "loss": 0.0043, + "step": 770 + }, + { + "epoch": 0.683662159166482, + "grad_norm": 0.23128758370876312, + "learning_rate": 9.003032212055467e-06, + "loss": 0.0027, + "step": 771 + }, + { + "epoch": 0.6845488805142984, + "grad_norm": 0.1778474599123001, + "learning_rate": 9.000161720556177e-06, + "loss": 0.0019, + "step": 772 + }, + { + "epoch": 0.6854356018621148, + "grad_norm": 0.6820386648178101, + "learning_rate": 8.997287561601825e-06, + "loss": 0.0076, + "step": 773 + }, + { + "epoch": 0.6863223232099313, + "grad_norm": 0.5324229001998901, + "learning_rate": 8.994409737827516e-06, + "loss": 0.0086, + "step": 774 + }, + { + "epoch": 0.6872090445577477, + "grad_norm": 0.9875842928886414, + "learning_rate": 8.991528251871717e-06, + "loss": 0.0101, + "step": 775 + }, + { + "epoch": 0.6880957659055642, + "grad_norm": 0.3629169464111328, + "learning_rate": 8.988643106376252e-06, + "loss": 0.0028, + "step": 776 + }, + { + "epoch": 0.6889824872533806, + "grad_norm": 0.5612802505493164, + "learning_rate": 8.985754303986298e-06, + "loss": 0.0053, + "step": 777 + }, + { + "epoch": 0.689869208601197, + "grad_norm": 0.327252060174942, + "learning_rate": 8.982861847350388e-06, + "loss": 0.0022, + "step": 778 + }, + { + "epoch": 0.6907559299490135, + "grad_norm": 0.6719294786453247, + "learning_rate": 8.979965739120402e-06, + "loss": 0.0089, + "step": 779 + }, + { + "epoch": 0.69164265129683, + "grad_norm": 0.5353887677192688, + "learning_rate": 8.977065981951567e-06, + "loss": 0.0132, + "step": 780 + }, + { + "epoch": 0.6925293726446464, + "grad_norm": 0.4548264443874359, + "learning_rate": 8.974162578502463e-06, + "loss": 0.0053, + "step": 781 + }, + { + "epoch": 0.6934160939924628, + "grad_norm": 0.5508539080619812, + "learning_rate": 8.971255531435004e-06, + "loss": 0.004, + "step": 782 + }, + { + "epoch": 0.6943028153402793, + "grad_norm": 0.24120058119297028, + "learning_rate": 8.96834484341445e-06, + "loss": 0.0029, + "step": 783 + }, + { + "epoch": 0.6951895366880958, + "grad_norm": 0.562086820602417, + "learning_rate": 8.965430517109397e-06, + "loss": 0.0065, + "step": 784 + }, + { + "epoch": 0.6960762580359122, + "grad_norm": 0.3117201328277588, + "learning_rate": 8.962512555191776e-06, + "loss": 0.0022, + "step": 785 + }, + { + "epoch": 0.6969629793837286, + "grad_norm": 0.4886745810508728, + "learning_rate": 8.959590960336854e-06, + "loss": 0.0136, + "step": 786 + }, + { + "epoch": 0.6978497007315451, + "grad_norm": 0.2502830922603607, + "learning_rate": 8.956665735223227e-06, + "loss": 0.0027, + "step": 787 + }, + { + "epoch": 0.6987364220793616, + "grad_norm": 0.8199619650840759, + "learning_rate": 8.953736882532819e-06, + "loss": 0.0068, + "step": 788 + }, + { + "epoch": 0.699623143427178, + "grad_norm": 0.48591238260269165, + "learning_rate": 8.95080440495088e-06, + "loss": 0.0043, + "step": 789 + }, + { + "epoch": 0.7005098647749944, + "grad_norm": 0.6423205137252808, + "learning_rate": 8.947868305165985e-06, + "loss": 0.0033, + "step": 790 + }, + { + "epoch": 0.7013965861228109, + "grad_norm": 0.122403085231781, + "learning_rate": 8.944928585870028e-06, + "loss": 0.0014, + "step": 791 + }, + { + "epoch": 0.7022833074706274, + "grad_norm": 0.41814419627189636, + "learning_rate": 8.941985249758221e-06, + "loss": 0.0066, + "step": 792 + }, + { + "epoch": 0.7031700288184438, + "grad_norm": 0.6854628324508667, + "learning_rate": 8.939038299529094e-06, + "loss": 0.0102, + "step": 793 + }, + { + "epoch": 0.7040567501662602, + "grad_norm": 0.3310871720314026, + "learning_rate": 8.936087737884492e-06, + "loss": 0.0058, + "step": 794 + }, + { + "epoch": 0.7049434715140767, + "grad_norm": 0.34315598011016846, + "learning_rate": 8.933133567529564e-06, + "loss": 0.0062, + "step": 795 + }, + { + "epoch": 0.7058301928618932, + "grad_norm": 0.17736274003982544, + "learning_rate": 8.930175791172776e-06, + "loss": 0.002, + "step": 796 + }, + { + "epoch": 0.7067169142097096, + "grad_norm": 0.8943119645118713, + "learning_rate": 8.927214411525895e-06, + "loss": 0.014, + "step": 797 + }, + { + "epoch": 0.707603635557526, + "grad_norm": 0.6538609862327576, + "learning_rate": 8.924249431303994e-06, + "loss": 0.0119, + "step": 798 + }, + { + "epoch": 0.7084903569053425, + "grad_norm": 1.4449471235275269, + "learning_rate": 8.921280853225443e-06, + "loss": 0.0204, + "step": 799 + }, + { + "epoch": 0.709377078253159, + "grad_norm": 0.2964303493499756, + "learning_rate": 8.918308680011915e-06, + "loss": 0.0065, + "step": 800 + }, + { + "epoch": 0.7102637996009754, + "grad_norm": 0.912601113319397, + "learning_rate": 8.915332914388382e-06, + "loss": 0.0083, + "step": 801 + }, + { + "epoch": 0.7111505209487918, + "grad_norm": 0.9547178149223328, + "learning_rate": 8.912353559083098e-06, + "loss": 0.0111, + "step": 802 + }, + { + "epoch": 0.7120372422966083, + "grad_norm": 0.3429473042488098, + "learning_rate": 8.909370616827621e-06, + "loss": 0.0057, + "step": 803 + }, + { + "epoch": 0.7129239636444248, + "grad_norm": 0.713907778263092, + "learning_rate": 8.90638409035679e-06, + "loss": 0.006, + "step": 804 + }, + { + "epoch": 0.7138106849922412, + "grad_norm": 0.40096068382263184, + "learning_rate": 8.903393982408734e-06, + "loss": 0.0098, + "step": 805 + }, + { + "epoch": 0.7146974063400576, + "grad_norm": 0.538386344909668, + "learning_rate": 8.90040029572486e-06, + "loss": 0.0051, + "step": 806 + }, + { + "epoch": 0.715584127687874, + "grad_norm": 1.1172994375228882, + "learning_rate": 8.897403033049866e-06, + "loss": 0.0154, + "step": 807 + }, + { + "epoch": 0.7164708490356906, + "grad_norm": 0.31477785110473633, + "learning_rate": 8.894402197131715e-06, + "loss": 0.0059, + "step": 808 + }, + { + "epoch": 0.717357570383507, + "grad_norm": 1.1198912858963013, + "learning_rate": 8.89139779072166e-06, + "loss": 0.0074, + "step": 809 + }, + { + "epoch": 0.7182442917313234, + "grad_norm": 0.4623129963874817, + "learning_rate": 8.888389816574214e-06, + "loss": 0.0067, + "step": 810 + }, + { + "epoch": 0.7191310130791398, + "grad_norm": 0.400272011756897, + "learning_rate": 8.885378277447176e-06, + "loss": 0.0125, + "step": 811 + }, + { + "epoch": 0.7200177344269564, + "grad_norm": 0.4467921257019043, + "learning_rate": 8.882363176101601e-06, + "loss": 0.0122, + "step": 812 + }, + { + "epoch": 0.7209044557747728, + "grad_norm": 0.26177653670310974, + "learning_rate": 8.879344515301814e-06, + "loss": 0.0032, + "step": 813 + }, + { + "epoch": 0.7217911771225892, + "grad_norm": 0.3401440978050232, + "learning_rate": 8.876322297815406e-06, + "loss": 0.0058, + "step": 814 + }, + { + "epoch": 0.7226778984704056, + "grad_norm": 0.5830419063568115, + "learning_rate": 8.873296526413224e-06, + "loss": 0.0069, + "step": 815 + }, + { + "epoch": 0.7235646198182222, + "grad_norm": 0.4246460795402527, + "learning_rate": 8.870267203869379e-06, + "loss": 0.009, + "step": 816 + }, + { + "epoch": 0.7244513411660386, + "grad_norm": 0.3305572271347046, + "learning_rate": 8.867234332961232e-06, + "loss": 0.0052, + "step": 817 + }, + { + "epoch": 0.725338062513855, + "grad_norm": 0.23400214314460754, + "learning_rate": 8.864197916469404e-06, + "loss": 0.004, + "step": 818 + }, + { + "epoch": 0.7262247838616714, + "grad_norm": 0.46885445713996887, + "learning_rate": 8.861157957177757e-06, + "loss": 0.0044, + "step": 819 + }, + { + "epoch": 0.727111505209488, + "grad_norm": 0.17335279285907745, + "learning_rate": 8.858114457873411e-06, + "loss": 0.0017, + "step": 820 + }, + { + "epoch": 0.7279982265573044, + "grad_norm": 0.42308181524276733, + "learning_rate": 8.855067421346727e-06, + "loss": 0.0066, + "step": 821 + }, + { + "epoch": 0.7288849479051208, + "grad_norm": 0.29205307364463806, + "learning_rate": 8.852016850391309e-06, + "loss": 0.0022, + "step": 822 + }, + { + "epoch": 0.7297716692529372, + "grad_norm": 0.7200466990470886, + "learning_rate": 8.848962747804e-06, + "loss": 0.0052, + "step": 823 + }, + { + "epoch": 0.7306583906007537, + "grad_norm": 0.43672290444374084, + "learning_rate": 8.845905116384885e-06, + "loss": 0.0044, + "step": 824 + }, + { + "epoch": 0.7315451119485702, + "grad_norm": 0.07030118256807327, + "learning_rate": 8.842843958937282e-06, + "loss": 0.0004, + "step": 825 + }, + { + "epoch": 0.7324318332963866, + "grad_norm": 0.30260026454925537, + "learning_rate": 8.839779278267742e-06, + "loss": 0.0014, + "step": 826 + }, + { + "epoch": 0.733318554644203, + "grad_norm": 0.479753702878952, + "learning_rate": 8.836711077186045e-06, + "loss": 0.0025, + "step": 827 + }, + { + "epoch": 0.7342052759920195, + "grad_norm": 0.42094236612319946, + "learning_rate": 8.833639358505202e-06, + "loss": 0.0035, + "step": 828 + }, + { + "epoch": 0.735091997339836, + "grad_norm": 0.7436134815216064, + "learning_rate": 8.830564125041443e-06, + "loss": 0.0041, + "step": 829 + }, + { + "epoch": 0.7359787186876524, + "grad_norm": 0.3896116018295288, + "learning_rate": 8.827485379614224e-06, + "loss": 0.0021, + "step": 830 + }, + { + "epoch": 0.7368654400354688, + "grad_norm": 0.4691396653652191, + "learning_rate": 8.824403125046225e-06, + "loss": 0.0075, + "step": 831 + }, + { + "epoch": 0.7377521613832853, + "grad_norm": 0.3711012303829193, + "learning_rate": 8.821317364163338e-06, + "loss": 0.0017, + "step": 832 + }, + { + "epoch": 0.7386388827311018, + "grad_norm": 0.4321559965610504, + "learning_rate": 8.81822809979467e-06, + "loss": 0.0047, + "step": 833 + }, + { + "epoch": 0.7395256040789182, + "grad_norm": 0.9089894890785217, + "learning_rate": 8.815135334772539e-06, + "loss": 0.0032, + "step": 834 + }, + { + "epoch": 0.7404123254267346, + "grad_norm": 0.034477680921554565, + "learning_rate": 8.81203907193248e-06, + "loss": 0.0002, + "step": 835 + }, + { + "epoch": 0.741299046774551, + "grad_norm": 0.4759688675403595, + "learning_rate": 8.808939314113225e-06, + "loss": 0.0115, + "step": 836 + }, + { + "epoch": 0.7421857681223676, + "grad_norm": 0.6464022397994995, + "learning_rate": 8.805836064156716e-06, + "loss": 0.0041, + "step": 837 + }, + { + "epoch": 0.743072489470184, + "grad_norm": 0.7758044004440308, + "learning_rate": 8.802729324908095e-06, + "loss": 0.013, + "step": 838 + }, + { + "epoch": 0.7439592108180004, + "grad_norm": 0.10477610677480698, + "learning_rate": 8.799619099215705e-06, + "loss": 0.0005, + "step": 839 + }, + { + "epoch": 0.7448459321658168, + "grad_norm": 1.6846563816070557, + "learning_rate": 8.796505389931082e-06, + "loss": 0.0206, + "step": 840 + }, + { + "epoch": 0.7457326535136334, + "grad_norm": 0.13585907220840454, + "learning_rate": 8.793388199908959e-06, + "loss": 0.0009, + "step": 841 + }, + { + "epoch": 0.7466193748614498, + "grad_norm": 0.3324284553527832, + "learning_rate": 8.79026753200726e-06, + "loss": 0.0037, + "step": 842 + }, + { + "epoch": 0.7475060962092662, + "grad_norm": 0.21177026629447937, + "learning_rate": 8.787143389087093e-06, + "loss": 0.0017, + "step": 843 + }, + { + "epoch": 0.7483928175570826, + "grad_norm": 0.45269775390625, + "learning_rate": 8.784015774012758e-06, + "loss": 0.0077, + "step": 844 + }, + { + "epoch": 0.7492795389048992, + "grad_norm": 0.2514367699623108, + "learning_rate": 8.780884689651738e-06, + "loss": 0.0032, + "step": 845 + }, + { + "epoch": 0.7501662602527156, + "grad_norm": 0.4557270109653473, + "learning_rate": 8.77775013887469e-06, + "loss": 0.0057, + "step": 846 + }, + { + "epoch": 0.751052981600532, + "grad_norm": 0.4305650293827057, + "learning_rate": 8.77461212455546e-06, + "loss": 0.0039, + "step": 847 + }, + { + "epoch": 0.7519397029483484, + "grad_norm": 0.3271408975124359, + "learning_rate": 8.771470649571056e-06, + "loss": 0.0032, + "step": 848 + }, + { + "epoch": 0.752826424296165, + "grad_norm": 0.8781595230102539, + "learning_rate": 8.768325716801673e-06, + "loss": 0.0094, + "step": 849 + }, + { + "epoch": 0.7537131456439814, + "grad_norm": 0.28247398138046265, + "learning_rate": 8.765177329130665e-06, + "loss": 0.0063, + "step": 850 + }, + { + "epoch": 0.7545998669917978, + "grad_norm": 0.5735777020454407, + "learning_rate": 8.76202548944456e-06, + "loss": 0.0037, + "step": 851 + }, + { + "epoch": 0.7554865883396142, + "grad_norm": 0.14849910140037537, + "learning_rate": 8.758870200633047e-06, + "loss": 0.0011, + "step": 852 + }, + { + "epoch": 0.7563733096874308, + "grad_norm": 0.23138736188411713, + "learning_rate": 8.755711465588981e-06, + "loss": 0.0062, + "step": 853 + }, + { + "epoch": 0.7572600310352472, + "grad_norm": 0.2745361924171448, + "learning_rate": 8.752549287208373e-06, + "loss": 0.0017, + "step": 854 + }, + { + "epoch": 0.7581467523830636, + "grad_norm": 0.31626009941101074, + "learning_rate": 8.749383668390393e-06, + "loss": 0.0026, + "step": 855 + }, + { + "epoch": 0.75903347373088, + "grad_norm": 0.29030266404151917, + "learning_rate": 8.746214612037365e-06, + "loss": 0.0055, + "step": 856 + }, + { + "epoch": 0.7599201950786966, + "grad_norm": 0.4959680736064911, + "learning_rate": 8.743042121054766e-06, + "loss": 0.0041, + "step": 857 + }, + { + "epoch": 0.760806916426513, + "grad_norm": 0.1988179087638855, + "learning_rate": 8.739866198351218e-06, + "loss": 0.0017, + "step": 858 + }, + { + "epoch": 0.7616936377743294, + "grad_norm": 0.06984910368919373, + "learning_rate": 8.736686846838491e-06, + "loss": 0.0005, + "step": 859 + }, + { + "epoch": 0.7625803591221458, + "grad_norm": 0.7530263662338257, + "learning_rate": 8.733504069431502e-06, + "loss": 0.0099, + "step": 860 + }, + { + "epoch": 0.7634670804699624, + "grad_norm": 0.47911015152931213, + "learning_rate": 8.730317869048306e-06, + "loss": 0.005, + "step": 861 + }, + { + "epoch": 0.7643538018177788, + "grad_norm": 0.7026636600494385, + "learning_rate": 8.727128248610096e-06, + "loss": 0.0083, + "step": 862 + }, + { + "epoch": 0.7652405231655952, + "grad_norm": 0.5120250582695007, + "learning_rate": 8.723935211041198e-06, + "loss": 0.006, + "step": 863 + }, + { + "epoch": 0.7661272445134116, + "grad_norm": 0.37591156363487244, + "learning_rate": 8.72073875926908e-06, + "loss": 0.003, + "step": 864 + }, + { + "epoch": 0.7670139658612282, + "grad_norm": 0.333332896232605, + "learning_rate": 8.717538896224333e-06, + "loss": 0.0045, + "step": 865 + }, + { + "epoch": 0.7679006872090446, + "grad_norm": 0.3260776996612549, + "learning_rate": 8.714335624840674e-06, + "loss": 0.003, + "step": 866 + }, + { + "epoch": 0.768787408556861, + "grad_norm": 0.44862043857574463, + "learning_rate": 8.711128948054947e-06, + "loss": 0.007, + "step": 867 + }, + { + "epoch": 0.7696741299046774, + "grad_norm": 0.8102560639381409, + "learning_rate": 8.707918868807123e-06, + "loss": 0.0022, + "step": 868 + }, + { + "epoch": 0.7705608512524939, + "grad_norm": 0.3125271797180176, + "learning_rate": 8.704705390040285e-06, + "loss": 0.0044, + "step": 869 + }, + { + "epoch": 0.7714475726003104, + "grad_norm": 0.35220733284950256, + "learning_rate": 8.701488514700638e-06, + "loss": 0.003, + "step": 870 + }, + { + "epoch": 0.7723342939481268, + "grad_norm": 0.32635432481765747, + "learning_rate": 8.6982682457375e-06, + "loss": 0.0012, + "step": 871 + }, + { + "epoch": 0.7732210152959432, + "grad_norm": 0.27862560749053955, + "learning_rate": 8.695044586103297e-06, + "loss": 0.0024, + "step": 872 + }, + { + "epoch": 0.7741077366437596, + "grad_norm": 0.11573431640863419, + "learning_rate": 8.691817538753566e-06, + "loss": 0.0005, + "step": 873 + }, + { + "epoch": 0.7749944579915762, + "grad_norm": 1.2635929584503174, + "learning_rate": 8.688587106646953e-06, + "loss": 0.0134, + "step": 874 + }, + { + "epoch": 0.7758811793393926, + "grad_norm": 0.4768292009830475, + "learning_rate": 8.685353292745203e-06, + "loss": 0.0062, + "step": 875 + }, + { + "epoch": 0.776767900687209, + "grad_norm": 0.44109392166137695, + "learning_rate": 8.682116100013163e-06, + "loss": 0.0058, + "step": 876 + }, + { + "epoch": 0.7776546220350254, + "grad_norm": 0.13942331075668335, + "learning_rate": 8.678875531418778e-06, + "loss": 0.0012, + "step": 877 + }, + { + "epoch": 0.778541343382842, + "grad_norm": 0.3853679597377777, + "learning_rate": 8.675631589933086e-06, + "loss": 0.0044, + "step": 878 + }, + { + "epoch": 0.7794280647306584, + "grad_norm": 0.7974981069564819, + "learning_rate": 8.672384278530222e-06, + "loss": 0.0141, + "step": 879 + }, + { + "epoch": 0.7803147860784748, + "grad_norm": 0.5448789000511169, + "learning_rate": 8.669133600187406e-06, + "loss": 0.0072, + "step": 880 + }, + { + "epoch": 0.7812015074262912, + "grad_norm": 0.6990935802459717, + "learning_rate": 8.665879557884946e-06, + "loss": 0.0113, + "step": 881 + }, + { + "epoch": 0.7820882287741078, + "grad_norm": 0.3424188494682312, + "learning_rate": 8.662622154606238e-06, + "loss": 0.0046, + "step": 882 + }, + { + "epoch": 0.7829749501219242, + "grad_norm": 0.22132602334022522, + "learning_rate": 8.659361393337752e-06, + "loss": 0.0023, + "step": 883 + }, + { + "epoch": 0.7838616714697406, + "grad_norm": 0.4831432104110718, + "learning_rate": 8.656097277069044e-06, + "loss": 0.0041, + "step": 884 + }, + { + "epoch": 0.784748392817557, + "grad_norm": 0.5113421082496643, + "learning_rate": 8.652829808792742e-06, + "loss": 0.0093, + "step": 885 + }, + { + "epoch": 0.7856351141653736, + "grad_norm": 0.36989861726760864, + "learning_rate": 8.64955899150455e-06, + "loss": 0.0039, + "step": 886 + }, + { + "epoch": 0.78652183551319, + "grad_norm": 0.3535897433757782, + "learning_rate": 8.646284828203236e-06, + "loss": 0.0124, + "step": 887 + }, + { + "epoch": 0.7874085568610064, + "grad_norm": 0.6427195072174072, + "learning_rate": 8.643007321890646e-06, + "loss": 0.005, + "step": 888 + }, + { + "epoch": 0.7882952782088228, + "grad_norm": 0.12736086547374725, + "learning_rate": 8.63972647557168e-06, + "loss": 0.0021, + "step": 889 + }, + { + "epoch": 0.7891819995566394, + "grad_norm": 0.1027836948633194, + "learning_rate": 8.63644229225431e-06, + "loss": 0.0017, + "step": 890 + }, + { + "epoch": 0.7900687209044558, + "grad_norm": 0.41020867228507996, + "learning_rate": 8.633154774949559e-06, + "loss": 0.0076, + "step": 891 + }, + { + "epoch": 0.7909554422522722, + "grad_norm": 0.2587355375289917, + "learning_rate": 8.629863926671514e-06, + "loss": 0.0027, + "step": 892 + }, + { + "epoch": 0.7918421636000886, + "grad_norm": 0.32279083132743835, + "learning_rate": 8.626569750437312e-06, + "loss": 0.0027, + "step": 893 + }, + { + "epoch": 0.7927288849479052, + "grad_norm": 0.26439985632896423, + "learning_rate": 8.623272249267139e-06, + "loss": 0.002, + "step": 894 + }, + { + "epoch": 0.7936156062957216, + "grad_norm": 0.4109850823879242, + "learning_rate": 8.619971426184236e-06, + "loss": 0.0094, + "step": 895 + }, + { + "epoch": 0.794502327643538, + "grad_norm": 0.2232225239276886, + "learning_rate": 8.616667284214882e-06, + "loss": 0.0037, + "step": 896 + }, + { + "epoch": 0.7953890489913544, + "grad_norm": 0.4170536696910858, + "learning_rate": 8.613359826388404e-06, + "loss": 0.0108, + "step": 897 + }, + { + "epoch": 0.796275770339171, + "grad_norm": 0.3371315598487854, + "learning_rate": 8.610049055737168e-06, + "loss": 0.0026, + "step": 898 + }, + { + "epoch": 0.7971624916869874, + "grad_norm": 0.6959087252616882, + "learning_rate": 8.606734975296578e-06, + "loss": 0.0085, + "step": 899 + }, + { + "epoch": 0.7980492130348038, + "grad_norm": 0.38720524311065674, + "learning_rate": 8.60341758810507e-06, + "loss": 0.0024, + "step": 900 + }, + { + "epoch": 0.7989359343826202, + "grad_norm": 0.39712467789649963, + "learning_rate": 8.600096897204113e-06, + "loss": 0.0038, + "step": 901 + }, + { + "epoch": 0.7998226557304368, + "grad_norm": 0.2283264547586441, + "learning_rate": 8.596772905638207e-06, + "loss": 0.0017, + "step": 902 + }, + { + "epoch": 0.8007093770782532, + "grad_norm": 0.13432735204696655, + "learning_rate": 8.593445616454873e-06, + "loss": 0.0018, + "step": 903 + }, + { + "epoch": 0.8015960984260696, + "grad_norm": 0.35728439688682556, + "learning_rate": 8.59011503270466e-06, + "loss": 0.0057, + "step": 904 + }, + { + "epoch": 0.802482819773886, + "grad_norm": 0.6101589798927307, + "learning_rate": 8.586781157441138e-06, + "loss": 0.0057, + "step": 905 + }, + { + "epoch": 0.8033695411217026, + "grad_norm": 0.2695213854312897, + "learning_rate": 8.58344399372089e-06, + "loss": 0.0025, + "step": 906 + }, + { + "epoch": 0.804256262469519, + "grad_norm": 0.37189701199531555, + "learning_rate": 8.580103544603517e-06, + "loss": 0.0043, + "step": 907 + }, + { + "epoch": 0.8051429838173354, + "grad_norm": 0.4200766682624817, + "learning_rate": 8.576759813151635e-06, + "loss": 0.0017, + "step": 908 + }, + { + "epoch": 0.8060297051651518, + "grad_norm": 0.31807032227516174, + "learning_rate": 8.573412802430863e-06, + "loss": 0.003, + "step": 909 + }, + { + "epoch": 0.8069164265129684, + "grad_norm": 0.6575264930725098, + "learning_rate": 8.57006251550983e-06, + "loss": 0.0048, + "step": 910 + }, + { + "epoch": 0.8078031478607848, + "grad_norm": 0.33519044518470764, + "learning_rate": 8.566708955460172e-06, + "loss": 0.0043, + "step": 911 + }, + { + "epoch": 0.8086898692086012, + "grad_norm": 0.7663727402687073, + "learning_rate": 8.56335212535652e-06, + "loss": 0.0068, + "step": 912 + }, + { + "epoch": 0.8095765905564176, + "grad_norm": 0.2573295831680298, + "learning_rate": 8.559992028276502e-06, + "loss": 0.0051, + "step": 913 + }, + { + "epoch": 0.810463311904234, + "grad_norm": 0.49369877576828003, + "learning_rate": 8.556628667300752e-06, + "loss": 0.0118, + "step": 914 + }, + { + "epoch": 0.8113500332520506, + "grad_norm": 0.10165636241436005, + "learning_rate": 8.553262045512882e-06, + "loss": 0.0004, + "step": 915 + }, + { + "epoch": 0.812236754599867, + "grad_norm": 0.7126047611236572, + "learning_rate": 8.549892165999505e-06, + "loss": 0.0069, + "step": 916 + }, + { + "epoch": 0.8131234759476834, + "grad_norm": 0.2403372973203659, + "learning_rate": 8.546519031850216e-06, + "loss": 0.0026, + "step": 917 + }, + { + "epoch": 0.8140101972954998, + "grad_norm": 0.5325596332550049, + "learning_rate": 8.543142646157594e-06, + "loss": 0.0066, + "step": 918 + }, + { + "epoch": 0.8148969186433164, + "grad_norm": 0.09692367166280746, + "learning_rate": 8.5397630120172e-06, + "loss": 0.001, + "step": 919 + }, + { + "epoch": 0.8157836399911328, + "grad_norm": 0.43505728244781494, + "learning_rate": 8.536380132527572e-06, + "loss": 0.0049, + "step": 920 + }, + { + "epoch": 0.8166703613389492, + "grad_norm": 1.0189909934997559, + "learning_rate": 8.53299401079022e-06, + "loss": 0.0202, + "step": 921 + }, + { + "epoch": 0.8175570826867656, + "grad_norm": 0.151530921459198, + "learning_rate": 8.52960464990964e-06, + "loss": 0.0016, + "step": 922 + }, + { + "epoch": 0.8184438040345822, + "grad_norm": 0.11746247112751007, + "learning_rate": 8.526212052993284e-06, + "loss": 0.0017, + "step": 923 + }, + { + "epoch": 0.8193305253823986, + "grad_norm": 0.25555944442749023, + "learning_rate": 8.522816223151573e-06, + "loss": 0.0018, + "step": 924 + }, + { + "epoch": 0.820217246730215, + "grad_norm": 0.13405092060565948, + "learning_rate": 8.519417163497898e-06, + "loss": 0.0024, + "step": 925 + }, + { + "epoch": 0.8211039680780314, + "grad_norm": 0.31290844082832336, + "learning_rate": 8.516014877148605e-06, + "loss": 0.0037, + "step": 926 + }, + { + "epoch": 0.821990689425848, + "grad_norm": 0.13099424540996552, + "learning_rate": 8.512609367223005e-06, + "loss": 0.0018, + "step": 927 + }, + { + "epoch": 0.8228774107736644, + "grad_norm": 0.18229612708091736, + "learning_rate": 8.509200636843354e-06, + "loss": 0.0025, + "step": 928 + }, + { + "epoch": 0.8237641321214808, + "grad_norm": 0.20748937129974365, + "learning_rate": 8.505788689134872e-06, + "loss": 0.0044, + "step": 929 + }, + { + "epoch": 0.8246508534692972, + "grad_norm": 0.28571903705596924, + "learning_rate": 8.502373527225725e-06, + "loss": 0.0035, + "step": 930 + }, + { + "epoch": 0.8255375748171138, + "grad_norm": 0.0889572873711586, + "learning_rate": 8.49895515424702e-06, + "loss": 0.0018, + "step": 931 + }, + { + "epoch": 0.8264242961649302, + "grad_norm": 0.16316168010234833, + "learning_rate": 8.495533573332814e-06, + "loss": 0.0017, + "step": 932 + }, + { + "epoch": 0.8273110175127466, + "grad_norm": 0.23393435776233673, + "learning_rate": 8.492108787620106e-06, + "loss": 0.0032, + "step": 933 + }, + { + "epoch": 0.828197738860563, + "grad_norm": 0.49376270174980164, + "learning_rate": 8.488680800248828e-06, + "loss": 0.0086, + "step": 934 + }, + { + "epoch": 0.8290844602083796, + "grad_norm": 0.3670783042907715, + "learning_rate": 8.485249614361852e-06, + "loss": 0.0024, + "step": 935 + }, + { + "epoch": 0.829971181556196, + "grad_norm": 0.26419225335121155, + "learning_rate": 8.481815233104982e-06, + "loss": 0.0016, + "step": 936 + }, + { + "epoch": 0.8308579029040124, + "grad_norm": 0.343707799911499, + "learning_rate": 8.478377659626949e-06, + "loss": 0.005, + "step": 937 + }, + { + "epoch": 0.8317446242518288, + "grad_norm": 0.035789605230093, + "learning_rate": 8.47493689707941e-06, + "loss": 0.0003, + "step": 938 + }, + { + "epoch": 0.8326313455996454, + "grad_norm": 0.5384289622306824, + "learning_rate": 8.471492948616953e-06, + "loss": 0.0126, + "step": 939 + }, + { + "epoch": 0.8335180669474618, + "grad_norm": 0.26499831676483154, + "learning_rate": 8.468045817397076e-06, + "loss": 0.0015, + "step": 940 + }, + { + "epoch": 0.8344047882952782, + "grad_norm": 0.5022488236427307, + "learning_rate": 8.464595506580207e-06, + "loss": 0.0058, + "step": 941 + }, + { + "epoch": 0.8352915096430946, + "grad_norm": 0.8323764801025391, + "learning_rate": 8.461142019329677e-06, + "loss": 0.0088, + "step": 942 + }, + { + "epoch": 0.8361782309909112, + "grad_norm": 0.3151646852493286, + "learning_rate": 8.45768535881174e-06, + "loss": 0.0044, + "step": 943 + }, + { + "epoch": 0.8370649523387276, + "grad_norm": 0.19980305433273315, + "learning_rate": 8.454225528195553e-06, + "loss": 0.001, + "step": 944 + }, + { + "epoch": 0.837951673686544, + "grad_norm": 0.3161185383796692, + "learning_rate": 8.45076253065318e-06, + "loss": 0.0045, + "step": 945 + }, + { + "epoch": 0.8388383950343604, + "grad_norm": 0.42954885959625244, + "learning_rate": 8.447296369359593e-06, + "loss": 0.0016, + "step": 946 + }, + { + "epoch": 0.839725116382177, + "grad_norm": 0.23368383944034576, + "learning_rate": 8.443827047492657e-06, + "loss": 0.0016, + "step": 947 + }, + { + "epoch": 0.8406118377299934, + "grad_norm": 0.14157956838607788, + "learning_rate": 8.44035456823314e-06, + "loss": 0.0008, + "step": 948 + }, + { + "epoch": 0.8414985590778098, + "grad_norm": 0.8546500205993652, + "learning_rate": 8.436878934764705e-06, + "loss": 0.0051, + "step": 949 + }, + { + "epoch": 0.8423852804256262, + "grad_norm": 0.16669033467769623, + "learning_rate": 8.433400150273907e-06, + "loss": 0.0008, + "step": 950 + }, + { + "epoch": 0.8432720017734427, + "grad_norm": 0.1262633204460144, + "learning_rate": 8.429918217950184e-06, + "loss": 0.0008, + "step": 951 + }, + { + "epoch": 0.8441587231212592, + "grad_norm": 0.38040781021118164, + "learning_rate": 8.426433140985867e-06, + "loss": 0.0023, + "step": 952 + }, + { + "epoch": 0.8450454444690756, + "grad_norm": 0.2027808576822281, + "learning_rate": 8.42294492257617e-06, + "loss": 0.0014, + "step": 953 + }, + { + "epoch": 0.845932165816892, + "grad_norm": 1.2989972829818726, + "learning_rate": 8.419453565919183e-06, + "loss": 0.007, + "step": 954 + }, + { + "epoch": 0.8468188871647085, + "grad_norm": 0.4618437588214874, + "learning_rate": 8.415959074215871e-06, + "loss": 0.0033, + "step": 955 + }, + { + "epoch": 0.847705608512525, + "grad_norm": 0.41583216190338135, + "learning_rate": 8.412461450670083e-06, + "loss": 0.0073, + "step": 956 + }, + { + "epoch": 0.8485923298603414, + "grad_norm": 0.26886528730392456, + "learning_rate": 8.408960698488531e-06, + "loss": 0.0009, + "step": 957 + }, + { + "epoch": 0.8494790512081578, + "grad_norm": 0.6726037859916687, + "learning_rate": 8.405456820880797e-06, + "loss": 0.0083, + "step": 958 + }, + { + "epoch": 0.8503657725559742, + "grad_norm": 0.9834551811218262, + "learning_rate": 8.401949821059331e-06, + "loss": 0.0086, + "step": 959 + }, + { + "epoch": 0.8512524939037908, + "grad_norm": 0.29446637630462646, + "learning_rate": 8.398439702239447e-06, + "loss": 0.0011, + "step": 960 + }, + { + "epoch": 0.8521392152516072, + "grad_norm": 0.6566726565361023, + "learning_rate": 8.39492646763931e-06, + "loss": 0.0047, + "step": 961 + }, + { + "epoch": 0.8530259365994236, + "grad_norm": 0.18082024157047272, + "learning_rate": 8.391410120479952e-06, + "loss": 0.002, + "step": 962 + }, + { + "epoch": 0.85391265794724, + "grad_norm": 0.038998160511255264, + "learning_rate": 8.38789066398525e-06, + "loss": 0.0004, + "step": 963 + }, + { + "epoch": 0.8547993792950566, + "grad_norm": 0.19278480112552643, + "learning_rate": 8.384368101381939e-06, + "loss": 0.0047, + "step": 964 + }, + { + "epoch": 0.855686100642873, + "grad_norm": 0.36077624559402466, + "learning_rate": 8.380842435899595e-06, + "loss": 0.0054, + "step": 965 + }, + { + "epoch": 0.8565728219906894, + "grad_norm": 0.6876051425933838, + "learning_rate": 8.377313670770647e-06, + "loss": 0.0052, + "step": 966 + }, + { + "epoch": 0.8574595433385058, + "grad_norm": 0.2629197835922241, + "learning_rate": 8.373781809230355e-06, + "loss": 0.0022, + "step": 967 + }, + { + "epoch": 0.8583462646863224, + "grad_norm": 0.4707011282444, + "learning_rate": 8.37024685451683e-06, + "loss": 0.0068, + "step": 968 + }, + { + "epoch": 0.8592329860341388, + "grad_norm": 0.639545202255249, + "learning_rate": 8.366708809871009e-06, + "loss": 0.0087, + "step": 969 + }, + { + "epoch": 0.8601197073819552, + "grad_norm": 0.47514888644218445, + "learning_rate": 8.363167678536667e-06, + "loss": 0.0027, + "step": 970 + }, + { + "epoch": 0.8610064287297716, + "grad_norm": 0.32375380396842957, + "learning_rate": 8.359623463760408e-06, + "loss": 0.0033, + "step": 971 + }, + { + "epoch": 0.8618931500775882, + "grad_norm": 0.1954675018787384, + "learning_rate": 8.356076168791663e-06, + "loss": 0.0025, + "step": 972 + }, + { + "epoch": 0.8627798714254046, + "grad_norm": 0.4773697853088379, + "learning_rate": 8.352525796882685e-06, + "loss": 0.008, + "step": 973 + }, + { + "epoch": 0.863666592773221, + "grad_norm": 0.5595993399620056, + "learning_rate": 8.34897235128855e-06, + "loss": 0.0074, + "step": 974 + }, + { + "epoch": 0.8645533141210374, + "grad_norm": 0.6411868333816528, + "learning_rate": 8.345415835267154e-06, + "loss": 0.0071, + "step": 975 + }, + { + "epoch": 0.865440035468854, + "grad_norm": 0.2740042209625244, + "learning_rate": 8.341856252079206e-06, + "loss": 0.004, + "step": 976 + }, + { + "epoch": 0.8663267568166704, + "grad_norm": 0.43187814950942993, + "learning_rate": 8.338293604988223e-06, + "loss": 0.0031, + "step": 977 + }, + { + "epoch": 0.8672134781644868, + "grad_norm": 0.24849332869052887, + "learning_rate": 8.33472789726054e-06, + "loss": 0.0035, + "step": 978 + }, + { + "epoch": 0.8681001995123032, + "grad_norm": 0.24358658492565155, + "learning_rate": 8.331159132165287e-06, + "loss": 0.0016, + "step": 979 + }, + { + "epoch": 0.8689869208601197, + "grad_norm": 0.553687572479248, + "learning_rate": 8.327587312974411e-06, + "loss": 0.0057, + "step": 980 + }, + { + "epoch": 0.8698736422079362, + "grad_norm": 0.23114646971225739, + "learning_rate": 8.324012442962644e-06, + "loss": 0.0026, + "step": 981 + }, + { + "epoch": 0.8707603635557526, + "grad_norm": 0.12544259428977966, + "learning_rate": 8.320434525407524e-06, + "loss": 0.0015, + "step": 982 + }, + { + "epoch": 0.871647084903569, + "grad_norm": 0.7338998317718506, + "learning_rate": 8.316853563589383e-06, + "loss": 0.0144, + "step": 983 + }, + { + "epoch": 0.8725338062513855, + "grad_norm": 0.38078662753105164, + "learning_rate": 8.313269560791343e-06, + "loss": 0.0033, + "step": 984 + }, + { + "epoch": 0.873420527599202, + "grad_norm": 0.37246477603912354, + "learning_rate": 8.309682520299312e-06, + "loss": 0.0047, + "step": 985 + }, + { + "epoch": 0.8743072489470184, + "grad_norm": 0.2513933777809143, + "learning_rate": 8.306092445401984e-06, + "loss": 0.0028, + "step": 986 + }, + { + "epoch": 0.8751939702948348, + "grad_norm": 0.7336315512657166, + "learning_rate": 8.302499339390836e-06, + "loss": 0.011, + "step": 987 + }, + { + "epoch": 0.8760806916426513, + "grad_norm": 0.07155678421258926, + "learning_rate": 8.298903205560123e-06, + "loss": 0.0008, + "step": 988 + }, + { + "epoch": 0.8769674129904678, + "grad_norm": 0.4643585681915283, + "learning_rate": 8.295304047206878e-06, + "loss": 0.003, + "step": 989 + }, + { + "epoch": 0.8778541343382842, + "grad_norm": 0.6717381477355957, + "learning_rate": 8.291701867630906e-06, + "loss": 0.0063, + "step": 990 + }, + { + "epoch": 0.8787408556861006, + "grad_norm": 0.491864949464798, + "learning_rate": 8.28809667013478e-06, + "loss": 0.0067, + "step": 991 + }, + { + "epoch": 0.8796275770339171, + "grad_norm": 0.6167854070663452, + "learning_rate": 8.284488458023841e-06, + "loss": 0.0048, + "step": 992 + }, + { + "epoch": 0.8805142983817336, + "grad_norm": 0.5936487317085266, + "learning_rate": 8.280877234606193e-06, + "loss": 0.0035, + "step": 993 + }, + { + "epoch": 0.88140101972955, + "grad_norm": 0.7591543793678284, + "learning_rate": 8.277263003192706e-06, + "loss": 0.0104, + "step": 994 + }, + { + "epoch": 0.8822877410773664, + "grad_norm": 0.43168148398399353, + "learning_rate": 8.273645767097001e-06, + "loss": 0.0035, + "step": 995 + }, + { + "epoch": 0.8831744624251829, + "grad_norm": 0.2748768627643585, + "learning_rate": 8.270025529635455e-06, + "loss": 0.0029, + "step": 996 + }, + { + "epoch": 0.8840611837729994, + "grad_norm": 0.3771921992301941, + "learning_rate": 8.266402294127203e-06, + "loss": 0.0061, + "step": 997 + }, + { + "epoch": 0.8849479051208158, + "grad_norm": 0.5574544072151184, + "learning_rate": 8.26277606389412e-06, + "loss": 0.0078, + "step": 998 + }, + { + "epoch": 0.8858346264686322, + "grad_norm": 0.36976248025894165, + "learning_rate": 8.259146842260834e-06, + "loss": 0.0059, + "step": 999 + }, + { + "epoch": 0.8867213478164487, + "grad_norm": 0.9293411374092102, + "learning_rate": 8.255514632554709e-06, + "loss": 0.0095, + "step": 1000 + }, + { + "epoch": 0.8876080691642652, + "grad_norm": 0.5999060273170471, + "learning_rate": 8.251879438105854e-06, + "loss": 0.0062, + "step": 1001 + }, + { + "epoch": 0.8884947905120816, + "grad_norm": 0.6218252182006836, + "learning_rate": 8.24824126224711e-06, + "loss": 0.0092, + "step": 1002 + }, + { + "epoch": 0.889381511859898, + "grad_norm": 0.4173696041107178, + "learning_rate": 8.244600108314058e-06, + "loss": 0.0057, + "step": 1003 + }, + { + "epoch": 0.8902682332077144, + "grad_norm": 0.12720154225826263, + "learning_rate": 8.240955979645001e-06, + "loss": 0.0015, + "step": 1004 + }, + { + "epoch": 0.891154954555531, + "grad_norm": 0.8230817317962646, + "learning_rate": 8.237308879580974e-06, + "loss": 0.0071, + "step": 1005 + }, + { + "epoch": 0.8920416759033474, + "grad_norm": 0.7403504252433777, + "learning_rate": 8.23365881146574e-06, + "loss": 0.01, + "step": 1006 + }, + { + "epoch": 0.8929283972511638, + "grad_norm": 0.3044225871562958, + "learning_rate": 8.230005778645773e-06, + "loss": 0.0039, + "step": 1007 + }, + { + "epoch": 0.8938151185989802, + "grad_norm": 0.27469322085380554, + "learning_rate": 8.226349784470276e-06, + "loss": 0.0023, + "step": 1008 + }, + { + "epoch": 0.8947018399467968, + "grad_norm": 0.20195840299129486, + "learning_rate": 8.222690832291158e-06, + "loss": 0.0025, + "step": 1009 + }, + { + "epoch": 0.8955885612946132, + "grad_norm": 0.25450944900512695, + "learning_rate": 8.219028925463045e-06, + "loss": 0.0027, + "step": 1010 + }, + { + "epoch": 0.8964752826424296, + "grad_norm": 0.6547467708587646, + "learning_rate": 8.215364067343272e-06, + "loss": 0.0064, + "step": 1011 + }, + { + "epoch": 0.897362003990246, + "grad_norm": 0.3546174466609955, + "learning_rate": 8.211696261291879e-06, + "loss": 0.0039, + "step": 1012 + }, + { + "epoch": 0.8982487253380625, + "grad_norm": 0.7712265253067017, + "learning_rate": 8.208025510671609e-06, + "loss": 0.005, + "step": 1013 + }, + { + "epoch": 0.899135446685879, + "grad_norm": 0.07138879597187042, + "learning_rate": 8.204351818847902e-06, + "loss": 0.0008, + "step": 1014 + }, + { + "epoch": 0.9000221680336954, + "grad_norm": 0.2987552583217621, + "learning_rate": 8.200675189188899e-06, + "loss": 0.0025, + "step": 1015 + }, + { + "epoch": 0.9009088893815118, + "grad_norm": 0.5162558555603027, + "learning_rate": 8.19699562506543e-06, + "loss": 0.0024, + "step": 1016 + }, + { + "epoch": 0.9017956107293283, + "grad_norm": 0.1842619776725769, + "learning_rate": 8.193313129851019e-06, + "loss": 0.002, + "step": 1017 + }, + { + "epoch": 0.9026823320771448, + "grad_norm": 0.4205833375453949, + "learning_rate": 8.189627706921876e-06, + "loss": 0.0058, + "step": 1018 + }, + { + "epoch": 0.9035690534249612, + "grad_norm": 0.9251869916915894, + "learning_rate": 8.185939359656895e-06, + "loss": 0.0108, + "step": 1019 + }, + { + "epoch": 0.9044557747727776, + "grad_norm": 0.5171855092048645, + "learning_rate": 8.18224809143765e-06, + "loss": 0.0018, + "step": 1020 + }, + { + "epoch": 0.9053424961205941, + "grad_norm": 1.3423070907592773, + "learning_rate": 8.178553905648396e-06, + "loss": 0.0107, + "step": 1021 + }, + { + "epoch": 0.9062292174684106, + "grad_norm": 0.4659717381000519, + "learning_rate": 8.174856805676062e-06, + "loss": 0.0028, + "step": 1022 + }, + { + "epoch": 0.907115938816227, + "grad_norm": 0.1671382635831833, + "learning_rate": 8.171156794910245e-06, + "loss": 0.0014, + "step": 1023 + }, + { + "epoch": 0.9080026601640434, + "grad_norm": 0.921686589717865, + "learning_rate": 8.167453876743215e-06, + "loss": 0.0078, + "step": 1024 + }, + { + "epoch": 0.9088893815118599, + "grad_norm": 0.4190947413444519, + "learning_rate": 8.163748054569905e-06, + "loss": 0.0029, + "step": 1025 + }, + { + "epoch": 0.9097761028596764, + "grad_norm": 0.5790007710456848, + "learning_rate": 8.160039331787915e-06, + "loss": 0.0048, + "step": 1026 + }, + { + "epoch": 0.9106628242074928, + "grad_norm": 0.4046242833137512, + "learning_rate": 8.156327711797499e-06, + "loss": 0.0043, + "step": 1027 + }, + { + "epoch": 0.9115495455553092, + "grad_norm": 0.35926681756973267, + "learning_rate": 8.152613198001567e-06, + "loss": 0.0021, + "step": 1028 + }, + { + "epoch": 0.9124362669031257, + "grad_norm": 0.4275892376899719, + "learning_rate": 8.14889579380569e-06, + "loss": 0.005, + "step": 1029 + }, + { + "epoch": 0.9133229882509422, + "grad_norm": 0.5041569471359253, + "learning_rate": 8.14517550261808e-06, + "loss": 0.0032, + "step": 1030 + }, + { + "epoch": 0.9142097095987586, + "grad_norm": 0.7166852951049805, + "learning_rate": 8.141452327849596e-06, + "loss": 0.0091, + "step": 1031 + }, + { + "epoch": 0.915096430946575, + "grad_norm": 0.4294228255748749, + "learning_rate": 8.13772627291375e-06, + "loss": 0.0051, + "step": 1032 + }, + { + "epoch": 0.9159831522943915, + "grad_norm": 0.7514994144439697, + "learning_rate": 8.133997341226686e-06, + "loss": 0.014, + "step": 1033 + }, + { + "epoch": 0.916869873642208, + "grad_norm": 0.6899821162223816, + "learning_rate": 8.130265536207189e-06, + "loss": 0.0063, + "step": 1034 + }, + { + "epoch": 0.9177565949900244, + "grad_norm": 0.08081519603729248, + "learning_rate": 8.126530861276677e-06, + "loss": 0.0009, + "step": 1035 + }, + { + "epoch": 0.9186433163378408, + "grad_norm": 0.09478479623794556, + "learning_rate": 8.1227933198592e-06, + "loss": 0.0018, + "step": 1036 + }, + { + "epoch": 0.9195300376856573, + "grad_norm": 0.21334019303321838, + "learning_rate": 8.119052915381432e-06, + "loss": 0.0021, + "step": 1037 + }, + { + "epoch": 0.9204167590334738, + "grad_norm": 0.27267512679100037, + "learning_rate": 8.11530965127268e-06, + "loss": 0.0031, + "step": 1038 + }, + { + "epoch": 0.9213034803812902, + "grad_norm": 0.41256797313690186, + "learning_rate": 8.111563530964869e-06, + "loss": 0.004, + "step": 1039 + }, + { + "epoch": 0.9221902017291066, + "grad_norm": 0.9929311275482178, + "learning_rate": 8.107814557892539e-06, + "loss": 0.0042, + "step": 1040 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.3197706341743469, + "learning_rate": 8.104062735492852e-06, + "loss": 0.0071, + "step": 1041 + }, + { + "epoch": 0.9239636444247395, + "grad_norm": 1.6723576784133911, + "learning_rate": 8.100308067205576e-06, + "loss": 0.006, + "step": 1042 + }, + { + "epoch": 0.924850365772556, + "grad_norm": 0.22382718324661255, + "learning_rate": 8.096550556473092e-06, + "loss": 0.0019, + "step": 1043 + }, + { + "epoch": 0.9257370871203724, + "grad_norm": 0.2541332542896271, + "learning_rate": 8.092790206740386e-06, + "loss": 0.0021, + "step": 1044 + }, + { + "epoch": 0.9266238084681889, + "grad_norm": 0.22676189243793488, + "learning_rate": 8.089027021455047e-06, + "loss": 0.0048, + "step": 1045 + }, + { + "epoch": 0.9275105298160053, + "grad_norm": 0.14653460681438446, + "learning_rate": 8.085261004067264e-06, + "loss": 0.0012, + "step": 1046 + }, + { + "epoch": 0.9283972511638218, + "grad_norm": 0.5666484832763672, + "learning_rate": 8.081492158029824e-06, + "loss": 0.008, + "step": 1047 + }, + { + "epoch": 0.9292839725116382, + "grad_norm": 0.285400927066803, + "learning_rate": 8.077720486798102e-06, + "loss": 0.0032, + "step": 1048 + }, + { + "epoch": 0.9301706938594546, + "grad_norm": 0.5438045859336853, + "learning_rate": 8.07394599383007e-06, + "loss": 0.0024, + "step": 1049 + }, + { + "epoch": 0.9310574152072711, + "grad_norm": 0.37798741459846497, + "learning_rate": 8.070168682586283e-06, + "loss": 0.003, + "step": 1050 + }, + { + "epoch": 0.9319441365550876, + "grad_norm": 0.6178186535835266, + "learning_rate": 8.066388556529881e-06, + "loss": 0.0088, + "step": 1051 + }, + { + "epoch": 0.932830857902904, + "grad_norm": 0.39296531677246094, + "learning_rate": 8.062605619126585e-06, + "loss": 0.0044, + "step": 1052 + }, + { + "epoch": 0.9337175792507204, + "grad_norm": 0.47325724363327026, + "learning_rate": 8.05881987384469e-06, + "loss": 0.0043, + "step": 1053 + }, + { + "epoch": 0.9346043005985369, + "grad_norm": 0.7773491144180298, + "learning_rate": 8.055031324155072e-06, + "loss": 0.0097, + "step": 1054 + }, + { + "epoch": 0.9354910219463534, + "grad_norm": 0.5702770352363586, + "learning_rate": 8.05123997353117e-06, + "loss": 0.0024, + "step": 1055 + }, + { + "epoch": 0.9363777432941698, + "grad_norm": 0.20904433727264404, + "learning_rate": 8.047445825449001e-06, + "loss": 0.0009, + "step": 1056 + }, + { + "epoch": 0.9372644646419862, + "grad_norm": 0.293757826089859, + "learning_rate": 8.04364888338714e-06, + "loss": 0.0044, + "step": 1057 + }, + { + "epoch": 0.9381511859898027, + "grad_norm": 0.6664373874664307, + "learning_rate": 8.039849150826721e-06, + "loss": 0.0084, + "step": 1058 + }, + { + "epoch": 0.9390379073376192, + "grad_norm": 0.615222692489624, + "learning_rate": 8.036046631251444e-06, + "loss": 0.0038, + "step": 1059 + }, + { + "epoch": 0.9399246286854356, + "grad_norm": 0.6374018788337708, + "learning_rate": 8.03224132814756e-06, + "loss": 0.0052, + "step": 1060 + }, + { + "epoch": 0.940811350033252, + "grad_norm": 0.6782588362693787, + "learning_rate": 8.028433245003871e-06, + "loss": 0.0059, + "step": 1061 + }, + { + "epoch": 0.9416980713810685, + "grad_norm": 0.3152410686016083, + "learning_rate": 8.02462238531173e-06, + "loss": 0.0016, + "step": 1062 + }, + { + "epoch": 0.942584792728885, + "grad_norm": 0.793630063533783, + "learning_rate": 8.020808752565034e-06, + "loss": 0.009, + "step": 1063 + }, + { + "epoch": 0.9434715140767014, + "grad_norm": 0.3483654260635376, + "learning_rate": 8.016992350260227e-06, + "loss": 0.0029, + "step": 1064 + }, + { + "epoch": 0.9443582354245178, + "grad_norm": 0.8813765048980713, + "learning_rate": 8.013173181896283e-06, + "loss": 0.0039, + "step": 1065 + }, + { + "epoch": 0.9452449567723343, + "grad_norm": 0.2730615437030792, + "learning_rate": 8.009351250974721e-06, + "loss": 0.0032, + "step": 1066 + }, + { + "epoch": 0.9461316781201508, + "grad_norm": 0.3211769461631775, + "learning_rate": 8.00552656099959e-06, + "loss": 0.0022, + "step": 1067 + }, + { + "epoch": 0.9470183994679672, + "grad_norm": 0.5229982733726501, + "learning_rate": 8.001699115477464e-06, + "loss": 0.006, + "step": 1068 + }, + { + "epoch": 0.9479051208157836, + "grad_norm": 0.6488707661628723, + "learning_rate": 7.997868917917453e-06, + "loss": 0.0033, + "step": 1069 + }, + { + "epoch": 0.9487918421636001, + "grad_norm": 0.07874885201454163, + "learning_rate": 7.994035971831178e-06, + "loss": 0.0003, + "step": 1070 + }, + { + "epoch": 0.9496785635114166, + "grad_norm": 0.3380604684352875, + "learning_rate": 7.990200280732792e-06, + "loss": 0.0021, + "step": 1071 + }, + { + "epoch": 0.950565284859233, + "grad_norm": 0.48102006316185, + "learning_rate": 7.986361848138954e-06, + "loss": 0.0038, + "step": 1072 + }, + { + "epoch": 0.9514520062070494, + "grad_norm": 0.8021492958068848, + "learning_rate": 7.982520677568846e-06, + "loss": 0.0049, + "step": 1073 + }, + { + "epoch": 0.9523387275548659, + "grad_norm": 0.5544074177742004, + "learning_rate": 7.978676772544153e-06, + "loss": 0.0081, + "step": 1074 + }, + { + "epoch": 0.9532254489026823, + "grad_norm": 0.6286907196044922, + "learning_rate": 7.97483013658907e-06, + "loss": 0.0104, + "step": 1075 + }, + { + "epoch": 0.9541121702504988, + "grad_norm": 0.636128842830658, + "learning_rate": 7.970980773230296e-06, + "loss": 0.0027, + "step": 1076 + }, + { + "epoch": 0.9549988915983152, + "grad_norm": 0.018983036279678345, + "learning_rate": 7.96712868599703e-06, + "loss": 0.0002, + "step": 1077 + }, + { + "epoch": 0.9558856129461317, + "grad_norm": 0.3683958053588867, + "learning_rate": 7.96327387842097e-06, + "loss": 0.0024, + "step": 1078 + }, + { + "epoch": 0.9567723342939481, + "grad_norm": 0.36459797620773315, + "learning_rate": 7.959416354036303e-06, + "loss": 0.0014, + "step": 1079 + }, + { + "epoch": 0.9576590556417646, + "grad_norm": 0.45369142293930054, + "learning_rate": 7.955556116379715e-06, + "loss": 0.0044, + "step": 1080 + }, + { + "epoch": 0.958545776989581, + "grad_norm": 0.4924830198287964, + "learning_rate": 7.951693168990369e-06, + "loss": 0.0048, + "step": 1081 + }, + { + "epoch": 0.9594324983373975, + "grad_norm": 0.6706141233444214, + "learning_rate": 7.947827515409924e-06, + "loss": 0.0104, + "step": 1082 + }, + { + "epoch": 0.9603192196852139, + "grad_norm": 0.5923113226890564, + "learning_rate": 7.94395915918251e-06, + "loss": 0.0057, + "step": 1083 + }, + { + "epoch": 0.9612059410330304, + "grad_norm": 0.777499794960022, + "learning_rate": 7.940088103854741e-06, + "loss": 0.005, + "step": 1084 + }, + { + "epoch": 0.9620926623808468, + "grad_norm": 0.21997174620628357, + "learning_rate": 7.936214352975706e-06, + "loss": 0.0029, + "step": 1085 + }, + { + "epoch": 0.9629793837286633, + "grad_norm": 0.6433026790618896, + "learning_rate": 7.93233791009696e-06, + "loss": 0.0052, + "step": 1086 + }, + { + "epoch": 0.9638661050764797, + "grad_norm": 0.14001475274562836, + "learning_rate": 7.928458778772533e-06, + "loss": 0.0019, + "step": 1087 + }, + { + "epoch": 0.9647528264242962, + "grad_norm": 0.4639449417591095, + "learning_rate": 7.92457696255891e-06, + "loss": 0.0047, + "step": 1088 + }, + { + "epoch": 0.9656395477721126, + "grad_norm": 0.623169481754303, + "learning_rate": 7.920692465015052e-06, + "loss": 0.0053, + "step": 1089 + }, + { + "epoch": 0.9665262691199291, + "grad_norm": 0.5239494442939758, + "learning_rate": 7.916805289702363e-06, + "loss": 0.011, + "step": 1090 + }, + { + "epoch": 0.9674129904677455, + "grad_norm": 0.3540083169937134, + "learning_rate": 7.912915440184711e-06, + "loss": 0.004, + "step": 1091 + }, + { + "epoch": 0.968299711815562, + "grad_norm": 0.3683808147907257, + "learning_rate": 7.909022920028416e-06, + "loss": 0.0069, + "step": 1092 + }, + { + "epoch": 0.9691864331633784, + "grad_norm": 0.18447741866111755, + "learning_rate": 7.905127732802242e-06, + "loss": 0.0028, + "step": 1093 + }, + { + "epoch": 0.9700731545111948, + "grad_norm": 0.19582514464855194, + "learning_rate": 7.901229882077403e-06, + "loss": 0.0018, + "step": 1094 + }, + { + "epoch": 0.9709598758590113, + "grad_norm": 0.48264968395233154, + "learning_rate": 7.89732937142755e-06, + "loss": 0.0058, + "step": 1095 + }, + { + "epoch": 0.9718465972068278, + "grad_norm": 0.8256710767745972, + "learning_rate": 7.893426204428777e-06, + "loss": 0.0083, + "step": 1096 + }, + { + "epoch": 0.9727333185546442, + "grad_norm": 0.6322904825210571, + "learning_rate": 7.88952038465961e-06, + "loss": 0.0118, + "step": 1097 + }, + { + "epoch": 0.9736200399024606, + "grad_norm": 0.21434572339057922, + "learning_rate": 7.885611915701012e-06, + "loss": 0.0029, + "step": 1098 + }, + { + "epoch": 0.9745067612502771, + "grad_norm": 0.38644617795944214, + "learning_rate": 7.88170080113637e-06, + "loss": 0.0037, + "step": 1099 + }, + { + "epoch": 0.9753934825980936, + "grad_norm": 0.2619270384311676, + "learning_rate": 7.877787044551497e-06, + "loss": 0.0048, + "step": 1100 + }, + { + "epoch": 0.97628020394591, + "grad_norm": 0.5454851984977722, + "learning_rate": 7.873870649534631e-06, + "loss": 0.0052, + "step": 1101 + }, + { + "epoch": 0.9771669252937264, + "grad_norm": 0.46398743987083435, + "learning_rate": 7.869951619676428e-06, + "loss": 0.0067, + "step": 1102 + }, + { + "epoch": 0.9780536466415429, + "grad_norm": 0.2236352264881134, + "learning_rate": 7.866029958569956e-06, + "loss": 0.003, + "step": 1103 + }, + { + "epoch": 0.9789403679893594, + "grad_norm": 0.20091237127780914, + "learning_rate": 7.862105669810703e-06, + "loss": 0.0032, + "step": 1104 + }, + { + "epoch": 0.9798270893371758, + "grad_norm": 0.6972124576568604, + "learning_rate": 7.858178756996557e-06, + "loss": 0.0105, + "step": 1105 + }, + { + "epoch": 0.9807138106849922, + "grad_norm": 0.42576393485069275, + "learning_rate": 7.854249223727823e-06, + "loss": 0.0029, + "step": 1106 + }, + { + "epoch": 0.9816005320328087, + "grad_norm": 0.27041757106781006, + "learning_rate": 7.850317073607193e-06, + "loss": 0.0022, + "step": 1107 + }, + { + "epoch": 0.9824872533806251, + "grad_norm": 0.3191399574279785, + "learning_rate": 7.846382310239777e-06, + "loss": 0.0033, + "step": 1108 + }, + { + "epoch": 0.9833739747284416, + "grad_norm": 0.24137769639492035, + "learning_rate": 7.842444937233066e-06, + "loss": 0.0058, + "step": 1109 + }, + { + "epoch": 0.984260696076258, + "grad_norm": 0.5599633455276489, + "learning_rate": 7.838504958196947e-06, + "loss": 0.0052, + "step": 1110 + }, + { + "epoch": 0.9851474174240745, + "grad_norm": 0.436379611492157, + "learning_rate": 7.834562376743701e-06, + "loss": 0.0088, + "step": 1111 + }, + { + "epoch": 0.986034138771891, + "grad_norm": 0.4467504918575287, + "learning_rate": 7.83061719648799e-06, + "loss": 0.006, + "step": 1112 + }, + { + "epoch": 0.9869208601197074, + "grad_norm": 0.4567229449748993, + "learning_rate": 7.826669421046864e-06, + "loss": 0.0061, + "step": 1113 + }, + { + "epoch": 0.9878075814675238, + "grad_norm": 0.2151392251253128, + "learning_rate": 7.822719054039744e-06, + "loss": 0.0024, + "step": 1114 + }, + { + "epoch": 0.9886943028153403, + "grad_norm": 0.24533632397651672, + "learning_rate": 7.818766099088437e-06, + "loss": 0.005, + "step": 1115 + }, + { + "epoch": 0.9895810241631567, + "grad_norm": 0.1394466906785965, + "learning_rate": 7.814810559817115e-06, + "loss": 0.0012, + "step": 1116 + }, + { + "epoch": 0.9904677455109732, + "grad_norm": 0.20075161755084991, + "learning_rate": 7.810852439852321e-06, + "loss": 0.0014, + "step": 1117 + }, + { + "epoch": 0.9913544668587896, + "grad_norm": 0.030413122847676277, + "learning_rate": 7.806891742822965e-06, + "loss": 0.0002, + "step": 1118 + }, + { + "epoch": 0.9922411882066061, + "grad_norm": 0.2860347628593445, + "learning_rate": 7.802928472360323e-06, + "loss": 0.0024, + "step": 1119 + }, + { + "epoch": 0.9931279095544225, + "grad_norm": 0.0985894724726677, + "learning_rate": 7.798962632098024e-06, + "loss": 0.0008, + "step": 1120 + }, + { + "epoch": 0.994014630902239, + "grad_norm": 0.1537179946899414, + "learning_rate": 7.794994225672057e-06, + "loss": 0.0012, + "step": 1121 + }, + { + "epoch": 0.9949013522500554, + "grad_norm": 0.2673646807670593, + "learning_rate": 7.791023256720765e-06, + "loss": 0.002, + "step": 1122 + }, + { + "epoch": 0.9957880735978719, + "grad_norm": 0.8631387948989868, + "learning_rate": 7.787049728884835e-06, + "loss": 0.0041, + "step": 1123 + }, + { + "epoch": 0.9966747949456883, + "grad_norm": 0.2717569172382355, + "learning_rate": 7.783073645807307e-06, + "loss": 0.0026, + "step": 1124 + }, + { + "epoch": 0.9975615162935048, + "grad_norm": 0.4493410885334015, + "learning_rate": 7.779095011133555e-06, + "loss": 0.002, + "step": 1125 + }, + { + "epoch": 0.9984482376413212, + "grad_norm": 0.20788197219371796, + "learning_rate": 7.775113828511303e-06, + "loss": 0.0008, + "step": 1126 + }, + { + "epoch": 0.9993349589891377, + "grad_norm": 0.1764083057641983, + "learning_rate": 7.771130101590602e-06, + "loss": 0.001, + "step": 1127 + }, + { + "epoch": 1.0008867213478165, + "grad_norm": 0.8892470002174377, + "learning_rate": 7.767143834023842e-06, + "loss": 0.0098, + "step": 1128 + }, + { + "epoch": 1.0008867213478165, + "eval_loss": 0.04110870510339737, + "eval_runtime": 60.9693, + "eval_samples_per_second": 3.149, + "eval_steps_per_second": 0.787, + "step": 1128 + }, + { + "epoch": 1.0017734426956328, + "grad_norm": 1.0256974697113037, + "learning_rate": 7.76315502946574e-06, + "loss": 0.0036, + "step": 1129 + }, + { + "epoch": 1.0026601640434494, + "grad_norm": 0.16955800354480743, + "learning_rate": 7.759163691573332e-06, + "loss": 0.0013, + "step": 1130 + }, + { + "epoch": 1.0035468853912657, + "grad_norm": 0.10564973205327988, + "learning_rate": 7.755169824005989e-06, + "loss": 0.0006, + "step": 1131 + }, + { + "epoch": 1.0044336067390822, + "grad_norm": 0.2520776689052582, + "learning_rate": 7.751173430425393e-06, + "loss": 0.001, + "step": 1132 + }, + { + "epoch": 1.0053203280868988, + "grad_norm": 0.9520653486251831, + "learning_rate": 7.747174514495547e-06, + "loss": 0.0093, + "step": 1133 + }, + { + "epoch": 1.006207049434715, + "grad_norm": 0.41113871335983276, + "learning_rate": 7.743173079882763e-06, + "loss": 0.0018, + "step": 1134 + }, + { + "epoch": 1.0070937707825316, + "grad_norm": 0.10978075861930847, + "learning_rate": 7.73916913025566e-06, + "loss": 0.0005, + "step": 1135 + }, + { + "epoch": 1.0079804921303481, + "grad_norm": 0.3824840188026428, + "learning_rate": 7.73516266928517e-06, + "loss": 0.0017, + "step": 1136 + }, + { + "epoch": 1.0088672134781644, + "grad_norm": 0.1943664401769638, + "learning_rate": 7.73115370064452e-06, + "loss": 0.0008, + "step": 1137 + }, + { + "epoch": 1.009753934825981, + "grad_norm": 0.08970417082309723, + "learning_rate": 7.727142228009244e-06, + "loss": 0.0003, + "step": 1138 + }, + { + "epoch": 1.0106406561737973, + "grad_norm": 0.0703229010105133, + "learning_rate": 7.723128255057162e-06, + "loss": 0.0005, + "step": 1139 + }, + { + "epoch": 1.0115273775216138, + "grad_norm": 0.28951114416122437, + "learning_rate": 7.719111785468394e-06, + "loss": 0.0008, + "step": 1140 + }, + { + "epoch": 1.0124140988694303, + "grad_norm": 0.13246096670627594, + "learning_rate": 7.715092822925346e-06, + "loss": 0.0008, + "step": 1141 + }, + { + "epoch": 1.0133008202172467, + "grad_norm": 1.2307990789413452, + "learning_rate": 7.71107137111271e-06, + "loss": 0.0081, + "step": 1142 + }, + { + "epoch": 1.0141875415650632, + "grad_norm": 0.3421829640865326, + "learning_rate": 7.707047433717464e-06, + "loss": 0.0031, + "step": 1143 + }, + { + "epoch": 1.0150742629128797, + "grad_norm": 0.6147262454032898, + "learning_rate": 7.703021014428855e-06, + "loss": 0.0048, + "step": 1144 + }, + { + "epoch": 1.015960984260696, + "grad_norm": 0.38976818323135376, + "learning_rate": 7.698992116938418e-06, + "loss": 0.0036, + "step": 1145 + }, + { + "epoch": 1.0168477056085126, + "grad_norm": 0.6530481576919556, + "learning_rate": 7.694960744939948e-06, + "loss": 0.003, + "step": 1146 + }, + { + "epoch": 1.0177344269563289, + "grad_norm": 0.4899827539920807, + "learning_rate": 7.690926902129519e-06, + "loss": 0.0024, + "step": 1147 + }, + { + "epoch": 1.0186211483041454, + "grad_norm": 1.1396379470825195, + "learning_rate": 7.686890592205462e-06, + "loss": 0.0008, + "step": 1148 + }, + { + "epoch": 1.019507869651962, + "grad_norm": 0.3091973662376404, + "learning_rate": 7.682851818868377e-06, + "loss": 0.0059, + "step": 1149 + }, + { + "epoch": 1.0203945909997783, + "grad_norm": 0.3940574526786804, + "learning_rate": 7.678810585821118e-06, + "loss": 0.0028, + "step": 1150 + }, + { + "epoch": 1.0212813123475948, + "grad_norm": 1.2961633205413818, + "learning_rate": 7.674766896768794e-06, + "loss": 0.0024, + "step": 1151 + }, + { + "epoch": 1.0221680336954113, + "grad_norm": 0.48337897658348083, + "learning_rate": 7.67072075541877e-06, + "loss": 0.0047, + "step": 1152 + }, + { + "epoch": 1.0230547550432276, + "grad_norm": 0.578425407409668, + "learning_rate": 7.666672165480653e-06, + "loss": 0.0042, + "step": 1153 + }, + { + "epoch": 1.0239414763910442, + "grad_norm": 0.25309690833091736, + "learning_rate": 7.6626211306663e-06, + "loss": 0.0009, + "step": 1154 + }, + { + "epoch": 1.0248281977388605, + "grad_norm": 1.7819604873657227, + "learning_rate": 7.65856765468981e-06, + "loss": 0.0062, + "step": 1155 + }, + { + "epoch": 1.025714919086677, + "grad_norm": 0.3901142477989197, + "learning_rate": 7.654511741267514e-06, + "loss": 0.0042, + "step": 1156 + }, + { + "epoch": 1.0266016404344935, + "grad_norm": 0.10478702187538147, + "learning_rate": 7.650453394117983e-06, + "loss": 0.0006, + "step": 1157 + }, + { + "epoch": 1.0274883617823098, + "grad_norm": 0.10292093455791473, + "learning_rate": 7.646392616962019e-06, + "loss": 0.0006, + "step": 1158 + }, + { + "epoch": 1.0283750831301264, + "grad_norm": 0.02415108121931553, + "learning_rate": 7.642329413522653e-06, + "loss": 0.0002, + "step": 1159 + }, + { + "epoch": 1.029261804477943, + "grad_norm": 0.3483298122882843, + "learning_rate": 7.638263787525134e-06, + "loss": 0.0028, + "step": 1160 + }, + { + "epoch": 1.0301485258257592, + "grad_norm": 0.5602533221244812, + "learning_rate": 7.63419574269694e-06, + "loss": 0.0067, + "step": 1161 + }, + { + "epoch": 1.0310352471735758, + "grad_norm": 0.5920149087905884, + "learning_rate": 7.63012528276776e-06, + "loss": 0.0074, + "step": 1162 + }, + { + "epoch": 1.031921968521392, + "grad_norm": 0.24287919700145721, + "learning_rate": 7.6260524114695e-06, + "loss": 0.0017, + "step": 1163 + }, + { + "epoch": 1.0328086898692086, + "grad_norm": 3.347308397293091, + "learning_rate": 7.6219771325362825e-06, + "loss": 0.0081, + "step": 1164 + }, + { + "epoch": 1.0336954112170251, + "grad_norm": 0.49304619431495667, + "learning_rate": 7.617899449704427e-06, + "loss": 0.0052, + "step": 1165 + }, + { + "epoch": 1.0345821325648414, + "grad_norm": 0.29841598868370056, + "learning_rate": 7.613819366712464e-06, + "loss": 0.0019, + "step": 1166 + }, + { + "epoch": 1.035468853912658, + "grad_norm": 0.08790603280067444, + "learning_rate": 7.60973688730112e-06, + "loss": 0.0007, + "step": 1167 + }, + { + "epoch": 1.0363555752604743, + "grad_norm": 0.1790074110031128, + "learning_rate": 7.605652015213326e-06, + "loss": 0.0018, + "step": 1168 + }, + { + "epoch": 1.0372422966082908, + "grad_norm": 0.1106707751750946, + "learning_rate": 7.601564754194198e-06, + "loss": 0.0008, + "step": 1169 + }, + { + "epoch": 1.0381290179561073, + "grad_norm": 0.31249484419822693, + "learning_rate": 7.597475107991046e-06, + "loss": 0.004, + "step": 1170 + }, + { + "epoch": 1.0390157393039237, + "grad_norm": 0.1908506155014038, + "learning_rate": 7.593383080353369e-06, + "loss": 0.0013, + "step": 1171 + }, + { + "epoch": 1.0399024606517402, + "grad_norm": 0.7766817212104797, + "learning_rate": 7.5892886750328446e-06, + "loss": 0.007, + "step": 1172 + }, + { + "epoch": 1.0407891819995567, + "grad_norm": 0.8299360871315002, + "learning_rate": 7.585191895783335e-06, + "loss": 0.0056, + "step": 1173 + }, + { + "epoch": 1.041675903347373, + "grad_norm": 0.23749642074108124, + "learning_rate": 7.5810927463608766e-06, + "loss": 0.0017, + "step": 1174 + }, + { + "epoch": 1.0425626246951896, + "grad_norm": 0.10918860882520676, + "learning_rate": 7.576991230523678e-06, + "loss": 0.0014, + "step": 1175 + }, + { + "epoch": 1.043449346043006, + "grad_norm": 0.030476948246359825, + "learning_rate": 7.572887352032119e-06, + "loss": 0.0002, + "step": 1176 + }, + { + "epoch": 1.0443360673908224, + "grad_norm": 0.2967153787612915, + "learning_rate": 7.568781114648744e-06, + "loss": 0.0025, + "step": 1177 + }, + { + "epoch": 1.045222788738639, + "grad_norm": 0.2687462270259857, + "learning_rate": 7.5646725221382635e-06, + "loss": 0.0015, + "step": 1178 + }, + { + "epoch": 1.0461095100864553, + "grad_norm": 0.2979382276535034, + "learning_rate": 7.560561578267542e-06, + "loss": 0.0035, + "step": 1179 + }, + { + "epoch": 1.0469962314342718, + "grad_norm": 0.18216152489185333, + "learning_rate": 7.556448286805605e-06, + "loss": 0.0011, + "step": 1180 + }, + { + "epoch": 1.0478829527820883, + "grad_norm": 0.26704421639442444, + "learning_rate": 7.552332651523626e-06, + "loss": 0.0021, + "step": 1181 + }, + { + "epoch": 1.0487696741299046, + "grad_norm": 0.37852510809898376, + "learning_rate": 7.54821467619493e-06, + "loss": 0.0052, + "step": 1182 + }, + { + "epoch": 1.0496563954777212, + "grad_norm": 0.02862503007054329, + "learning_rate": 7.5440943645949845e-06, + "loss": 0.0002, + "step": 1183 + }, + { + "epoch": 1.0505431168255375, + "grad_norm": 0.4329982101917267, + "learning_rate": 7.539971720501407e-06, + "loss": 0.0032, + "step": 1184 + }, + { + "epoch": 1.051429838173354, + "grad_norm": 0.05105764791369438, + "learning_rate": 7.535846747693944e-06, + "loss": 0.0003, + "step": 1185 + }, + { + "epoch": 1.0523165595211705, + "grad_norm": 0.22195594012737274, + "learning_rate": 7.531719449954479e-06, + "loss": 0.0016, + "step": 1186 + }, + { + "epoch": 1.0532032808689868, + "grad_norm": 0.49873724579811096, + "learning_rate": 7.527589831067032e-06, + "loss": 0.01, + "step": 1187 + }, + { + "epoch": 1.0540900022168034, + "grad_norm": 0.09682756662368774, + "learning_rate": 7.523457894817745e-06, + "loss": 0.0004, + "step": 1188 + }, + { + "epoch": 1.05497672356462, + "grad_norm": 0.06749970465898514, + "learning_rate": 7.519323644994892e-06, + "loss": 0.0004, + "step": 1189 + }, + { + "epoch": 1.0558634449124362, + "grad_norm": 0.06560672074556351, + "learning_rate": 7.515187085388858e-06, + "loss": 0.0006, + "step": 1190 + }, + { + "epoch": 1.0567501662602528, + "grad_norm": 4.2813286781311035, + "learning_rate": 7.511048219792154e-06, + "loss": 0.0045, + "step": 1191 + }, + { + "epoch": 1.057636887608069, + "grad_norm": 0.16461029648780823, + "learning_rate": 7.506907051999402e-06, + "loss": 0.0009, + "step": 1192 + }, + { + "epoch": 1.0585236089558856, + "grad_norm": 0.5340306758880615, + "learning_rate": 7.502763585807336e-06, + "loss": 0.0053, + "step": 1193 + }, + { + "epoch": 1.0594103303037021, + "grad_norm": 0.6085299253463745, + "learning_rate": 7.498617825014795e-06, + "loss": 0.004, + "step": 1194 + }, + { + "epoch": 1.0602970516515184, + "grad_norm": 0.5654335021972656, + "learning_rate": 7.494469773422726e-06, + "loss": 0.0029, + "step": 1195 + }, + { + "epoch": 1.061183772999335, + "grad_norm": 0.13969686627388, + "learning_rate": 7.49031943483417e-06, + "loss": 0.0008, + "step": 1196 + }, + { + "epoch": 1.0620704943471515, + "grad_norm": 0.521077573299408, + "learning_rate": 7.48616681305427e-06, + "loss": 0.0032, + "step": 1197 + }, + { + "epoch": 1.0629572156949678, + "grad_norm": 0.5144971013069153, + "learning_rate": 7.4820119118902615e-06, + "loss": 0.0085, + "step": 1198 + }, + { + "epoch": 1.0638439370427843, + "grad_norm": 0.04687139019370079, + "learning_rate": 7.477854735151466e-06, + "loss": 0.0003, + "step": 1199 + }, + { + "epoch": 1.0647306583906007, + "grad_norm": 5.557599067687988, + "learning_rate": 7.473695286649296e-06, + "loss": 0.0046, + "step": 1200 + }, + { + "epoch": 1.0656173797384172, + "grad_norm": 0.12024810910224915, + "learning_rate": 7.469533570197245e-06, + "loss": 0.0006, + "step": 1201 + }, + { + "epoch": 1.0665041010862337, + "grad_norm": 0.14419999718666077, + "learning_rate": 7.4653695896108855e-06, + "loss": 0.0011, + "step": 1202 + }, + { + "epoch": 1.06739082243405, + "grad_norm": 0.2992253303527832, + "learning_rate": 7.461203348707866e-06, + "loss": 0.0027, + "step": 1203 + }, + { + "epoch": 1.0682775437818666, + "grad_norm": 0.058901526033878326, + "learning_rate": 7.457034851307907e-06, + "loss": 0.0007, + "step": 1204 + }, + { + "epoch": 1.069164265129683, + "grad_norm": 0.1338261067867279, + "learning_rate": 7.452864101232798e-06, + "loss": 0.0016, + "step": 1205 + }, + { + "epoch": 1.0700509864774994, + "grad_norm": 0.18810372054576874, + "learning_rate": 7.448691102306396e-06, + "loss": 0.0011, + "step": 1206 + }, + { + "epoch": 1.070937707825316, + "grad_norm": 0.5619723200798035, + "learning_rate": 7.444515858354615e-06, + "loss": 0.0016, + "step": 1207 + }, + { + "epoch": 1.0718244291731323, + "grad_norm": 0.10859323292970657, + "learning_rate": 7.4403383732054325e-06, + "loss": 0.0012, + "step": 1208 + }, + { + "epoch": 1.0727111505209488, + "grad_norm": 0.16468924283981323, + "learning_rate": 7.436158650688877e-06, + "loss": 0.0009, + "step": 1209 + }, + { + "epoch": 1.0735978718687653, + "grad_norm": 0.2525959312915802, + "learning_rate": 7.431976694637031e-06, + "loss": 0.0022, + "step": 1210 + }, + { + "epoch": 1.0744845932165816, + "grad_norm": 0.2525767385959625, + "learning_rate": 7.427792508884022e-06, + "loss": 0.0013, + "step": 1211 + }, + { + "epoch": 1.0753713145643982, + "grad_norm": 0.330912709236145, + "learning_rate": 7.423606097266024e-06, + "loss": 0.0039, + "step": 1212 + }, + { + "epoch": 1.0762580359122147, + "grad_norm": 0.36091116070747375, + "learning_rate": 7.41941746362125e-06, + "loss": 0.0018, + "step": 1213 + }, + { + "epoch": 1.077144757260031, + "grad_norm": 0.4603426456451416, + "learning_rate": 7.415226611789952e-06, + "loss": 0.0027, + "step": 1214 + }, + { + "epoch": 1.0780314786078475, + "grad_norm": 0.04140254855155945, + "learning_rate": 7.4110335456144156e-06, + "loss": 0.0001, + "step": 1215 + }, + { + "epoch": 1.0789181999556638, + "grad_norm": 0.13625720143318176, + "learning_rate": 7.406838268938954e-06, + "loss": 0.0008, + "step": 1216 + }, + { + "epoch": 1.0798049213034804, + "grad_norm": 0.23671278357505798, + "learning_rate": 7.402640785609911e-06, + "loss": 0.0017, + "step": 1217 + }, + { + "epoch": 1.080691642651297, + "grad_norm": 0.15733811259269714, + "learning_rate": 7.398441099475649e-06, + "loss": 0.001, + "step": 1218 + }, + { + "epoch": 1.0815783639991132, + "grad_norm": 1.2790669202804565, + "learning_rate": 7.394239214386555e-06, + "loss": 0.0119, + "step": 1219 + }, + { + "epoch": 1.0824650853469298, + "grad_norm": 1.2404475212097168, + "learning_rate": 7.390035134195027e-06, + "loss": 0.0009, + "step": 1220 + }, + { + "epoch": 1.083351806694746, + "grad_norm": 0.176893413066864, + "learning_rate": 7.385828862755478e-06, + "loss": 0.0012, + "step": 1221 + }, + { + "epoch": 1.0842385280425626, + "grad_norm": 0.244059219956398, + "learning_rate": 7.381620403924333e-06, + "loss": 0.0067, + "step": 1222 + }, + { + "epoch": 1.0851252493903791, + "grad_norm": 0.16676774621009827, + "learning_rate": 7.377409761560016e-06, + "loss": 0.0005, + "step": 1223 + }, + { + "epoch": 1.0860119707381954, + "grad_norm": 0.7514981627464294, + "learning_rate": 7.373196939522959e-06, + "loss": 0.0057, + "step": 1224 + }, + { + "epoch": 1.086898692086012, + "grad_norm": 0.6947178840637207, + "learning_rate": 7.368981941675587e-06, + "loss": 0.0031, + "step": 1225 + }, + { + "epoch": 1.0877854134338285, + "grad_norm": 0.024032553657889366, + "learning_rate": 7.364764771882324e-06, + "loss": 0.0001, + "step": 1226 + }, + { + "epoch": 1.0886721347816448, + "grad_norm": 0.17391324043273926, + "learning_rate": 7.360545434009585e-06, + "loss": 0.0006, + "step": 1227 + }, + { + "epoch": 1.0895588561294614, + "grad_norm": 0.5214890837669373, + "learning_rate": 7.356323931925769e-06, + "loss": 0.0057, + "step": 1228 + }, + { + "epoch": 1.0904455774772779, + "grad_norm": 0.3625035583972931, + "learning_rate": 7.352100269501264e-06, + "loss": 0.0026, + "step": 1229 + }, + { + "epoch": 1.0913322988250942, + "grad_norm": 0.3312781751155853, + "learning_rate": 7.3478744506084355e-06, + "loss": 0.0045, + "step": 1230 + }, + { + "epoch": 1.0922190201729107, + "grad_norm": 0.26414018869400024, + "learning_rate": 7.343646479121626e-06, + "loss": 0.0039, + "step": 1231 + }, + { + "epoch": 1.093105741520727, + "grad_norm": 0.09360918402671814, + "learning_rate": 7.339416358917156e-06, + "loss": 0.0005, + "step": 1232 + }, + { + "epoch": 1.0939924628685436, + "grad_norm": 0.014107626862823963, + "learning_rate": 7.33518409387331e-06, + "loss": 0.0001, + "step": 1233 + }, + { + "epoch": 1.09487918421636, + "grad_norm": 0.2544485330581665, + "learning_rate": 7.3309496878703415e-06, + "loss": 0.0056, + "step": 1234 + }, + { + "epoch": 1.0957659055641764, + "grad_norm": 0.6181140542030334, + "learning_rate": 7.326713144790467e-06, + "loss": 0.001, + "step": 1235 + }, + { + "epoch": 1.096652626911993, + "grad_norm": 0.27269816398620605, + "learning_rate": 7.322474468517865e-06, + "loss": 0.0029, + "step": 1236 + }, + { + "epoch": 1.0975393482598093, + "grad_norm": 0.44560036063194275, + "learning_rate": 7.318233662938664e-06, + "loss": 0.0009, + "step": 1237 + }, + { + "epoch": 1.0984260696076258, + "grad_norm": 0.22260509431362152, + "learning_rate": 7.3139907319409496e-06, + "loss": 0.0012, + "step": 1238 + }, + { + "epoch": 1.0993127909554423, + "grad_norm": 0.2584804594516754, + "learning_rate": 7.309745679414751e-06, + "loss": 0.0014, + "step": 1239 + }, + { + "epoch": 1.1001995123032586, + "grad_norm": 0.2714986205101013, + "learning_rate": 7.305498509252052e-06, + "loss": 0.0044, + "step": 1240 + }, + { + "epoch": 1.1010862336510752, + "grad_norm": 0.6800200343132019, + "learning_rate": 7.301249225346764e-06, + "loss": 0.0045, + "step": 1241 + }, + { + "epoch": 1.1019729549988917, + "grad_norm": 0.061328668147325516, + "learning_rate": 7.2969978315947485e-06, + "loss": 0.0004, + "step": 1242 + }, + { + "epoch": 1.102859676346708, + "grad_norm": 0.23961175978183746, + "learning_rate": 7.292744331893796e-06, + "loss": 0.0009, + "step": 1243 + }, + { + "epoch": 1.1037463976945245, + "grad_norm": 0.36839234828948975, + "learning_rate": 7.288488730143629e-06, + "loss": 0.0025, + "step": 1244 + }, + { + "epoch": 1.1046331190423408, + "grad_norm": 0.47886547446250916, + "learning_rate": 7.284231030245896e-06, + "loss": 0.0051, + "step": 1245 + }, + { + "epoch": 1.1055198403901574, + "grad_norm": 0.4376900792121887, + "learning_rate": 7.279971236104171e-06, + "loss": 0.0064, + "step": 1246 + }, + { + "epoch": 1.106406561737974, + "grad_norm": 0.09138593822717667, + "learning_rate": 7.275709351623945e-06, + "loss": 0.0007, + "step": 1247 + }, + { + "epoch": 1.1072932830857902, + "grad_norm": 0.40806150436401367, + "learning_rate": 7.271445380712629e-06, + "loss": 0.0043, + "step": 1248 + }, + { + "epoch": 1.1081800044336068, + "grad_norm": 0.11947987228631973, + "learning_rate": 7.267179327279545e-06, + "loss": 0.001, + "step": 1249 + }, + { + "epoch": 1.1090667257814233, + "grad_norm": 0.11575794219970703, + "learning_rate": 7.262911195235925e-06, + "loss": 0.0009, + "step": 1250 + }, + { + "epoch": 1.1099534471292396, + "grad_norm": 0.1301448494195938, + "learning_rate": 7.258640988494906e-06, + "loss": 0.0011, + "step": 1251 + }, + { + "epoch": 1.1108401684770561, + "grad_norm": 0.308881938457489, + "learning_rate": 7.254368710971529e-06, + "loss": 0.0018, + "step": 1252 + }, + { + "epoch": 1.1117268898248724, + "grad_norm": 0.5369164943695068, + "learning_rate": 7.2500943665827285e-06, + "loss": 0.0033, + "step": 1253 + }, + { + "epoch": 1.112613611172689, + "grad_norm": 0.0686551108956337, + "learning_rate": 7.245817959247341e-06, + "loss": 0.0005, + "step": 1254 + }, + { + "epoch": 1.1135003325205055, + "grad_norm": 0.33711907267570496, + "learning_rate": 7.241539492886089e-06, + "loss": 0.0018, + "step": 1255 + }, + { + "epoch": 1.1143870538683218, + "grad_norm": 0.3586072325706482, + "learning_rate": 7.237258971421587e-06, + "loss": 0.0061, + "step": 1256 + }, + { + "epoch": 1.1152737752161384, + "grad_norm": 0.036523330956697464, + "learning_rate": 7.232976398778329e-06, + "loss": 0.0002, + "step": 1257 + }, + { + "epoch": 1.1161604965639547, + "grad_norm": 0.23500844836235046, + "learning_rate": 7.2286917788826926e-06, + "loss": 0.0016, + "step": 1258 + }, + { + "epoch": 1.1170472179117712, + "grad_norm": 0.3606179654598236, + "learning_rate": 7.224405115662931e-06, + "loss": 0.002, + "step": 1259 + }, + { + "epoch": 1.1179339392595877, + "grad_norm": 0.14782433211803436, + "learning_rate": 7.220116413049173e-06, + "loss": 0.0014, + "step": 1260 + }, + { + "epoch": 1.118820660607404, + "grad_norm": 0.4738283157348633, + "learning_rate": 7.2158256749734155e-06, + "loss": 0.0046, + "step": 1261 + }, + { + "epoch": 1.1197073819552206, + "grad_norm": 0.4056955873966217, + "learning_rate": 7.21153290536952e-06, + "loss": 0.0034, + "step": 1262 + }, + { + "epoch": 1.120594103303037, + "grad_norm": 0.08267973363399506, + "learning_rate": 7.207238108173216e-06, + "loss": 0.0004, + "step": 1263 + }, + { + "epoch": 1.1214808246508534, + "grad_norm": 0.31053709983825684, + "learning_rate": 7.202941287322084e-06, + "loss": 0.0046, + "step": 1264 + }, + { + "epoch": 1.12236754599867, + "grad_norm": 0.2635895609855652, + "learning_rate": 7.198642446755566e-06, + "loss": 0.0019, + "step": 1265 + }, + { + "epoch": 1.1232542673464865, + "grad_norm": 0.5915330648422241, + "learning_rate": 7.194341590414954e-06, + "loss": 0.0065, + "step": 1266 + }, + { + "epoch": 1.1241409886943028, + "grad_norm": 0.2291322946548462, + "learning_rate": 7.190038722243387e-06, + "loss": 0.0015, + "step": 1267 + }, + { + "epoch": 1.1250277100421193, + "grad_norm": 0.21318402886390686, + "learning_rate": 7.18573384618585e-06, + "loss": 0.0009, + "step": 1268 + }, + { + "epoch": 1.1259144313899356, + "grad_norm": 0.03609558939933777, + "learning_rate": 7.181426966189168e-06, + "loss": 0.0003, + "step": 1269 + }, + { + "epoch": 1.1268011527377522, + "grad_norm": 0.9004939198493958, + "learning_rate": 7.177118086202004e-06, + "loss": 0.0111, + "step": 1270 + }, + { + "epoch": 1.1276878740855687, + "grad_norm": 0.3457188010215759, + "learning_rate": 7.1728072101748546e-06, + "loss": 0.0012, + "step": 1271 + }, + { + "epoch": 1.128574595433385, + "grad_norm": 0.4781457185745239, + "learning_rate": 7.168494342060044e-06, + "loss": 0.0096, + "step": 1272 + }, + { + "epoch": 1.1294613167812015, + "grad_norm": 0.8124074935913086, + "learning_rate": 7.164179485811728e-06, + "loss": 0.0019, + "step": 1273 + }, + { + "epoch": 1.1303480381290179, + "grad_norm": 0.3004269599914551, + "learning_rate": 7.159862645385879e-06, + "loss": 0.0015, + "step": 1274 + }, + { + "epoch": 1.1312347594768344, + "grad_norm": 0.5909804105758667, + "learning_rate": 7.155543824740294e-06, + "loss": 0.0059, + "step": 1275 + }, + { + "epoch": 1.132121480824651, + "grad_norm": 0.09179907292127609, + "learning_rate": 7.151223027834581e-06, + "loss": 0.0006, + "step": 1276 + }, + { + "epoch": 1.1330082021724672, + "grad_norm": 0.11039888113737106, + "learning_rate": 7.146900258630163e-06, + "loss": 0.0005, + "step": 1277 + }, + { + "epoch": 1.1338949235202838, + "grad_norm": 0.10031022131443024, + "learning_rate": 7.1425755210902705e-06, + "loss": 0.0006, + "step": 1278 + }, + { + "epoch": 1.1347816448681003, + "grad_norm": 0.5158107876777649, + "learning_rate": 7.138248819179937e-06, + "loss": 0.0045, + "step": 1279 + }, + { + "epoch": 1.1356683662159166, + "grad_norm": 0.2349659502506256, + "learning_rate": 7.133920156866001e-06, + "loss": 0.0031, + "step": 1280 + }, + { + "epoch": 1.1365550875637331, + "grad_norm": 0.17994800209999084, + "learning_rate": 7.129589538117092e-06, + "loss": 0.001, + "step": 1281 + }, + { + "epoch": 1.1374418089115497, + "grad_norm": 0.3246804475784302, + "learning_rate": 7.125256966903639e-06, + "loss": 0.0019, + "step": 1282 + }, + { + "epoch": 1.138328530259366, + "grad_norm": 0.47080448269844055, + "learning_rate": 7.12092244719786e-06, + "loss": 0.0053, + "step": 1283 + }, + { + "epoch": 1.1392152516071825, + "grad_norm": 0.10333869606256485, + "learning_rate": 7.116585982973756e-06, + "loss": 0.0006, + "step": 1284 + }, + { + "epoch": 1.1401019729549988, + "grad_norm": 0.34819576144218445, + "learning_rate": 7.112247578207115e-06, + "loss": 0.0034, + "step": 1285 + }, + { + "epoch": 1.1409886943028154, + "grad_norm": 0.3301628828048706, + "learning_rate": 7.107907236875501e-06, + "loss": 0.0016, + "step": 1286 + }, + { + "epoch": 1.1418754156506319, + "grad_norm": 0.42407017946243286, + "learning_rate": 7.103564962958256e-06, + "loss": 0.0078, + "step": 1287 + }, + { + "epoch": 1.1427621369984482, + "grad_norm": 0.592100977897644, + "learning_rate": 7.099220760436492e-06, + "loss": 0.0069, + "step": 1288 + }, + { + "epoch": 1.1436488583462647, + "grad_norm": 0.6017158627510071, + "learning_rate": 7.094874633293091e-06, + "loss": 0.0044, + "step": 1289 + }, + { + "epoch": 1.144535579694081, + "grad_norm": 0.025082416832447052, + "learning_rate": 7.090526585512696e-06, + "loss": 0.0003, + "step": 1290 + }, + { + "epoch": 1.1454223010418976, + "grad_norm": 0.1530878245830536, + "learning_rate": 7.086176621081715e-06, + "loss": 0.0012, + "step": 1291 + }, + { + "epoch": 1.146309022389714, + "grad_norm": 0.09143998473882675, + "learning_rate": 7.0818247439883115e-06, + "loss": 0.0007, + "step": 1292 + }, + { + "epoch": 1.1471957437375304, + "grad_norm": 0.11759691685438156, + "learning_rate": 7.077470958222402e-06, + "loss": 0.0005, + "step": 1293 + }, + { + "epoch": 1.148082465085347, + "grad_norm": 0.6441949605941772, + "learning_rate": 7.073115267775654e-06, + "loss": 0.0047, + "step": 1294 + }, + { + "epoch": 1.1489691864331633, + "grad_norm": 0.01792193576693535, + "learning_rate": 7.06875767664148e-06, + "loss": 0.0001, + "step": 1295 + }, + { + "epoch": 1.1498559077809798, + "grad_norm": 0.26153331995010376, + "learning_rate": 7.064398188815038e-06, + "loss": 0.0017, + "step": 1296 + }, + { + "epoch": 1.1507426291287963, + "grad_norm": 0.48967376351356506, + "learning_rate": 7.060036808293221e-06, + "loss": 0.0045, + "step": 1297 + }, + { + "epoch": 1.1516293504766126, + "grad_norm": 0.11976376920938492, + "learning_rate": 7.055673539074657e-06, + "loss": 0.0009, + "step": 1298 + }, + { + "epoch": 1.1525160718244292, + "grad_norm": 0.13650448620319366, + "learning_rate": 7.051308385159712e-06, + "loss": 0.0008, + "step": 1299 + }, + { + "epoch": 1.1534027931722457, + "grad_norm": 0.1300816833972931, + "learning_rate": 7.0469413505504745e-06, + "loss": 0.001, + "step": 1300 + }, + { + "epoch": 1.154289514520062, + "grad_norm": 0.1258881539106369, + "learning_rate": 7.042572439250756e-06, + "loss": 0.0005, + "step": 1301 + }, + { + "epoch": 1.1551762358678785, + "grad_norm": 0.0720561072230339, + "learning_rate": 7.038201655266093e-06, + "loss": 0.0004, + "step": 1302 + }, + { + "epoch": 1.156062957215695, + "grad_norm": 0.08578696846961975, + "learning_rate": 7.033829002603738e-06, + "loss": 0.0004, + "step": 1303 + }, + { + "epoch": 1.1569496785635114, + "grad_norm": 0.2591750919818878, + "learning_rate": 7.029454485272654e-06, + "loss": 0.0014, + "step": 1304 + }, + { + "epoch": 1.157836399911328, + "grad_norm": 0.13878989219665527, + "learning_rate": 7.0250781072835165e-06, + "loss": 0.0009, + "step": 1305 + }, + { + "epoch": 1.1587231212591442, + "grad_norm": 0.13020068407058716, + "learning_rate": 7.020699872648703e-06, + "loss": 0.0005, + "step": 1306 + }, + { + "epoch": 1.1596098426069608, + "grad_norm": 0.0145335141569376, + "learning_rate": 7.0163197853822975e-06, + "loss": 0.0001, + "step": 1307 + }, + { + "epoch": 1.1604965639547773, + "grad_norm": 0.04610709846019745, + "learning_rate": 7.0119378495000824e-06, + "loss": 0.0002, + "step": 1308 + }, + { + "epoch": 1.1613832853025936, + "grad_norm": 0.027545208111405373, + "learning_rate": 7.007554069019532e-06, + "loss": 0.0002, + "step": 1309 + }, + { + "epoch": 1.1622700066504101, + "grad_norm": 0.8998842239379883, + "learning_rate": 7.003168447959814e-06, + "loss": 0.0093, + "step": 1310 + }, + { + "epoch": 1.1631567279982264, + "grad_norm": 0.5461855530738831, + "learning_rate": 6.99878099034178e-06, + "loss": 0.0028, + "step": 1311 + }, + { + "epoch": 1.164043449346043, + "grad_norm": 0.3846938908100128, + "learning_rate": 6.994391700187975e-06, + "loss": 0.0042, + "step": 1312 + }, + { + "epoch": 1.1649301706938595, + "grad_norm": 0.08041378855705261, + "learning_rate": 6.9900005815226095e-06, + "loss": 0.0004, + "step": 1313 + }, + { + "epoch": 1.1658168920416758, + "grad_norm": 0.2825201153755188, + "learning_rate": 6.9856076383715845e-06, + "loss": 0.0043, + "step": 1314 + }, + { + "epoch": 1.1667036133894924, + "grad_norm": 0.22830918431282043, + "learning_rate": 6.981212874762463e-06, + "loss": 0.003, + "step": 1315 + }, + { + "epoch": 1.1675903347373089, + "grad_norm": 0.37838220596313477, + "learning_rate": 6.976816294724484e-06, + "loss": 0.0045, + "step": 1316 + }, + { + "epoch": 1.1684770560851252, + "grad_norm": 0.2836399972438812, + "learning_rate": 6.9724179022885505e-06, + "loss": 0.0018, + "step": 1317 + }, + { + "epoch": 1.1693637774329417, + "grad_norm": 0.39304614067077637, + "learning_rate": 6.968017701487223e-06, + "loss": 0.0048, + "step": 1318 + }, + { + "epoch": 1.1702504987807583, + "grad_norm": 0.14998720586299896, + "learning_rate": 6.963615696354726e-06, + "loss": 0.0008, + "step": 1319 + }, + { + "epoch": 1.1711372201285746, + "grad_norm": 0.10552586615085602, + "learning_rate": 6.9592118909269335e-06, + "loss": 0.0005, + "step": 1320 + }, + { + "epoch": 1.172023941476391, + "grad_norm": 0.06915909796953201, + "learning_rate": 6.954806289241374e-06, + "loss": 0.0004, + "step": 1321 + }, + { + "epoch": 1.1729106628242074, + "grad_norm": 0.18020567297935486, + "learning_rate": 6.950398895337216e-06, + "loss": 0.0011, + "step": 1322 + }, + { + "epoch": 1.173797384172024, + "grad_norm": 0.10368692129850388, + "learning_rate": 6.945989713255281e-06, + "loss": 0.0004, + "step": 1323 + }, + { + "epoch": 1.1746841055198405, + "grad_norm": 0.9406795501708984, + "learning_rate": 6.941578747038024e-06, + "loss": 0.0038, + "step": 1324 + }, + { + "epoch": 1.1755708268676568, + "grad_norm": 0.23332077264785767, + "learning_rate": 6.937166000729534e-06, + "loss": 0.0024, + "step": 1325 + }, + { + "epoch": 1.1764575482154733, + "grad_norm": 0.0631319135427475, + "learning_rate": 6.932751478375537e-06, + "loss": 0.0005, + "step": 1326 + }, + { + "epoch": 1.1773442695632896, + "grad_norm": 0.07518002390861511, + "learning_rate": 6.928335184023384e-06, + "loss": 0.0003, + "step": 1327 + }, + { + "epoch": 1.1782309909111062, + "grad_norm": 0.30457931756973267, + "learning_rate": 6.923917121722051e-06, + "loss": 0.0024, + "step": 1328 + }, + { + "epoch": 1.1791177122589227, + "grad_norm": 0.10226471722126007, + "learning_rate": 6.919497295522137e-06, + "loss": 0.0006, + "step": 1329 + }, + { + "epoch": 1.180004433606739, + "grad_norm": 0.11103340983390808, + "learning_rate": 6.9150757094758554e-06, + "loss": 0.0007, + "step": 1330 + }, + { + "epoch": 1.1808911549545555, + "grad_norm": 0.08289068937301636, + "learning_rate": 6.910652367637035e-06, + "loss": 0.0005, + "step": 1331 + }, + { + "epoch": 1.181777876302372, + "grad_norm": 0.24665898084640503, + "learning_rate": 6.906227274061114e-06, + "loss": 0.0012, + "step": 1332 + }, + { + "epoch": 1.1826645976501884, + "grad_norm": 0.3390233516693115, + "learning_rate": 6.901800432805135e-06, + "loss": 0.0034, + "step": 1333 + }, + { + "epoch": 1.183551318998005, + "grad_norm": 0.3979106843471527, + "learning_rate": 6.897371847927745e-06, + "loss": 0.0036, + "step": 1334 + }, + { + "epoch": 1.1844380403458212, + "grad_norm": 0.15676546096801758, + "learning_rate": 6.892941523489189e-06, + "loss": 0.0015, + "step": 1335 + }, + { + "epoch": 1.1853247616936378, + "grad_norm": 0.17191384732723236, + "learning_rate": 6.888509463551307e-06, + "loss": 0.0018, + "step": 1336 + }, + { + "epoch": 1.1862114830414543, + "grad_norm": 0.06105402857065201, + "learning_rate": 6.8840756721775285e-06, + "loss": 0.0005, + "step": 1337 + }, + { + "epoch": 1.1870982043892706, + "grad_norm": 0.22978165745735168, + "learning_rate": 6.879640153432875e-06, + "loss": 0.0035, + "step": 1338 + }, + { + "epoch": 1.1879849257370871, + "grad_norm": 0.28052595257759094, + "learning_rate": 6.875202911383945e-06, + "loss": 0.0018, + "step": 1339 + }, + { + "epoch": 1.1888716470849037, + "grad_norm": 0.5201903581619263, + "learning_rate": 6.870763950098922e-06, + "loss": 0.0032, + "step": 1340 + }, + { + "epoch": 1.18975836843272, + "grad_norm": 0.13970860838890076, + "learning_rate": 6.866323273647564e-06, + "loss": 0.0006, + "step": 1341 + }, + { + "epoch": 1.1906450897805365, + "grad_norm": 0.0728214755654335, + "learning_rate": 6.8618808861012025e-06, + "loss": 0.0002, + "step": 1342 + }, + { + "epoch": 1.1915318111283528, + "grad_norm": 0.16536439955234528, + "learning_rate": 6.8574367915327345e-06, + "loss": 0.0006, + "step": 1343 + }, + { + "epoch": 1.1924185324761694, + "grad_norm": 0.48193657398223877, + "learning_rate": 6.852990994016627e-06, + "loss": 0.0034, + "step": 1344 + }, + { + "epoch": 1.193305253823986, + "grad_norm": 0.45254138112068176, + "learning_rate": 6.848543497628905e-06, + "loss": 0.0036, + "step": 1345 + }, + { + "epoch": 1.1941919751718022, + "grad_norm": 0.11150401830673218, + "learning_rate": 6.844094306447152e-06, + "loss": 0.0008, + "step": 1346 + }, + { + "epoch": 1.1950786965196187, + "grad_norm": 0.23752820491790771, + "learning_rate": 6.839643424550506e-06, + "loss": 0.0023, + "step": 1347 + }, + { + "epoch": 1.195965417867435, + "grad_norm": 0.012399467639625072, + "learning_rate": 6.835190856019654e-06, + "loss": 0.0001, + "step": 1348 + }, + { + "epoch": 1.1968521392152516, + "grad_norm": 0.33054450154304504, + "learning_rate": 6.8307366049368285e-06, + "loss": 0.0026, + "step": 1349 + }, + { + "epoch": 1.197738860563068, + "grad_norm": 0.714648425579071, + "learning_rate": 6.826280675385807e-06, + "loss": 0.0108, + "step": 1350 + }, + { + "epoch": 1.1986255819108844, + "grad_norm": 0.22860702872276306, + "learning_rate": 6.821823071451905e-06, + "loss": 0.0023, + "step": 1351 + }, + { + "epoch": 1.199512303258701, + "grad_norm": 0.6131689548492432, + "learning_rate": 6.817363797221971e-06, + "loss": 0.0072, + "step": 1352 + }, + { + "epoch": 1.2003990246065175, + "grad_norm": 0.2593003213405609, + "learning_rate": 6.812902856784388e-06, + "loss": 0.0014, + "step": 1353 + }, + { + "epoch": 1.2012857459543338, + "grad_norm": 0.177522674202919, + "learning_rate": 6.808440254229066e-06, + "loss": 0.0011, + "step": 1354 + }, + { + "epoch": 1.2021724673021503, + "grad_norm": 0.2457665652036667, + "learning_rate": 6.803975993647437e-06, + "loss": 0.0034, + "step": 1355 + }, + { + "epoch": 1.2030591886499669, + "grad_norm": 0.11679854243993759, + "learning_rate": 6.7995100791324545e-06, + "loss": 0.0012, + "step": 1356 + }, + { + "epoch": 1.2039459099977832, + "grad_norm": 0.07699250429868698, + "learning_rate": 6.7950425147785885e-06, + "loss": 0.0007, + "step": 1357 + }, + { + "epoch": 1.2048326313455997, + "grad_norm": 0.5148756504058838, + "learning_rate": 6.79057330468182e-06, + "loss": 0.0024, + "step": 1358 + }, + { + "epoch": 1.205719352693416, + "grad_norm": 0.1937873661518097, + "learning_rate": 6.786102452939642e-06, + "loss": 0.0033, + "step": 1359 + }, + { + "epoch": 1.2066060740412325, + "grad_norm": 0.362888365983963, + "learning_rate": 6.78162996365105e-06, + "loss": 0.0035, + "step": 1360 + }, + { + "epoch": 1.207492795389049, + "grad_norm": 0.3451005816459656, + "learning_rate": 6.7771558409165405e-06, + "loss": 0.0029, + "step": 1361 + }, + { + "epoch": 1.2083795167368654, + "grad_norm": 0.1482471525669098, + "learning_rate": 6.7726800888381105e-06, + "loss": 0.0009, + "step": 1362 + }, + { + "epoch": 1.209266238084682, + "grad_norm": 0.11421269923448563, + "learning_rate": 6.768202711519248e-06, + "loss": 0.0015, + "step": 1363 + }, + { + "epoch": 1.2101529594324982, + "grad_norm": 0.18156251311302185, + "learning_rate": 6.763723713064933e-06, + "loss": 0.0015, + "step": 1364 + }, + { + "epoch": 1.2110396807803148, + "grad_norm": 0.07586735486984253, + "learning_rate": 6.759243097581629e-06, + "loss": 0.0013, + "step": 1365 + }, + { + "epoch": 1.2119264021281313, + "grad_norm": 0.2878761887550354, + "learning_rate": 6.754760869177285e-06, + "loss": 0.0031, + "step": 1366 + }, + { + "epoch": 1.2128131234759476, + "grad_norm": 0.6761088967323303, + "learning_rate": 6.750277031961328e-06, + "loss": 0.0055, + "step": 1367 + }, + { + "epoch": 1.2136998448237641, + "grad_norm": 0.13870671391487122, + "learning_rate": 6.745791590044659e-06, + "loss": 0.0012, + "step": 1368 + }, + { + "epoch": 1.2145865661715807, + "grad_norm": 0.0526190847158432, + "learning_rate": 6.741304547539652e-06, + "loss": 0.0003, + "step": 1369 + }, + { + "epoch": 1.215473287519397, + "grad_norm": 0.6361402273178101, + "learning_rate": 6.7368159085601456e-06, + "loss": 0.0045, + "step": 1370 + }, + { + "epoch": 1.2163600088672135, + "grad_norm": 0.09560070186853409, + "learning_rate": 6.7323256772214455e-06, + "loss": 0.0006, + "step": 1371 + }, + { + "epoch": 1.21724673021503, + "grad_norm": 0.13184469938278198, + "learning_rate": 6.727833857640316e-06, + "loss": 0.0008, + "step": 1372 + }, + { + "epoch": 1.2181334515628464, + "grad_norm": 0.2193000614643097, + "learning_rate": 6.723340453934974e-06, + "loss": 0.0028, + "step": 1373 + }, + { + "epoch": 1.219020172910663, + "grad_norm": 0.13727962970733643, + "learning_rate": 6.718845470225095e-06, + "loss": 0.0008, + "step": 1374 + }, + { + "epoch": 1.2199068942584792, + "grad_norm": 0.042113423347473145, + "learning_rate": 6.7143489106318e-06, + "loss": 0.0003, + "step": 1375 + }, + { + "epoch": 1.2207936156062957, + "grad_norm": 0.1653071641921997, + "learning_rate": 6.709850779277653e-06, + "loss": 0.0012, + "step": 1376 + }, + { + "epoch": 1.2216803369541123, + "grad_norm": 0.21689732372760773, + "learning_rate": 6.705351080286664e-06, + "loss": 0.0012, + "step": 1377 + }, + { + "epoch": 1.2225670583019286, + "grad_norm": 0.19632472097873688, + "learning_rate": 6.700849817784274e-06, + "loss": 0.0017, + "step": 1378 + }, + { + "epoch": 1.223453779649745, + "grad_norm": 0.020970387384295464, + "learning_rate": 6.696346995897363e-06, + "loss": 0.0002, + "step": 1379 + }, + { + "epoch": 1.2243405009975614, + "grad_norm": 0.07651633024215698, + "learning_rate": 6.691842618754235e-06, + "loss": 0.0004, + "step": 1380 + }, + { + "epoch": 1.225227222345378, + "grad_norm": 0.1662818342447281, + "learning_rate": 6.687336690484626e-06, + "loss": 0.0011, + "step": 1381 + }, + { + "epoch": 1.2261139436931945, + "grad_norm": 0.19208109378814697, + "learning_rate": 6.682829215219692e-06, + "loss": 0.0012, + "step": 1382 + }, + { + "epoch": 1.2270006650410108, + "grad_norm": 0.09741144627332687, + "learning_rate": 6.678320197092004e-06, + "loss": 0.0006, + "step": 1383 + }, + { + "epoch": 1.2278873863888273, + "grad_norm": 0.17250119149684906, + "learning_rate": 6.673809640235552e-06, + "loss": 0.0012, + "step": 1384 + }, + { + "epoch": 1.2287741077366436, + "grad_norm": 0.13515061140060425, + "learning_rate": 6.6692975487857355e-06, + "loss": 0.0009, + "step": 1385 + }, + { + "epoch": 1.2296608290844602, + "grad_norm": 0.2506014406681061, + "learning_rate": 6.664783926879359e-06, + "loss": 0.0029, + "step": 1386 + }, + { + "epoch": 1.2305475504322767, + "grad_norm": 0.1211366206407547, + "learning_rate": 6.660268778654631e-06, + "loss": 0.0004, + "step": 1387 + }, + { + "epoch": 1.231434271780093, + "grad_norm": 0.20620831847190857, + "learning_rate": 6.655752108251161e-06, + "loss": 0.0008, + "step": 1388 + }, + { + "epoch": 1.2323209931279095, + "grad_norm": 0.785988450050354, + "learning_rate": 6.651233919809955e-06, + "loss": 0.0068, + "step": 1389 + }, + { + "epoch": 1.233207714475726, + "grad_norm": 0.35752174258232117, + "learning_rate": 6.6467142174734055e-06, + "loss": 0.0022, + "step": 1390 + }, + { + "epoch": 1.2340944358235424, + "grad_norm": 0.9221594929695129, + "learning_rate": 6.642193005385298e-06, + "loss": 0.0031, + "step": 1391 + }, + { + "epoch": 1.234981157171359, + "grad_norm": 0.4186972677707672, + "learning_rate": 6.6376702876908e-06, + "loss": 0.0023, + "step": 1392 + }, + { + "epoch": 1.2358678785191755, + "grad_norm": 0.18855422735214233, + "learning_rate": 6.63314606853646e-06, + "loss": 0.0005, + "step": 1393 + }, + { + "epoch": 1.2367545998669918, + "grad_norm": 0.047466155141592026, + "learning_rate": 6.628620352070201e-06, + "loss": 0.0003, + "step": 1394 + }, + { + "epoch": 1.2376413212148083, + "grad_norm": 0.17096386849880219, + "learning_rate": 6.624093142441323e-06, + "loss": 0.0006, + "step": 1395 + }, + { + "epoch": 1.2385280425626246, + "grad_norm": 0.038220904767513275, + "learning_rate": 6.619564443800494e-06, + "loss": 0.0003, + "step": 1396 + }, + { + "epoch": 1.2394147639104411, + "grad_norm": 0.006028464064002037, + "learning_rate": 6.615034260299742e-06, + "loss": 0.0, + "step": 1397 + }, + { + "epoch": 1.2403014852582577, + "grad_norm": 0.48989105224609375, + "learning_rate": 6.610502596092464e-06, + "loss": 0.004, + "step": 1398 + }, + { + "epoch": 1.241188206606074, + "grad_norm": 0.06377265602350235, + "learning_rate": 6.605969455333407e-06, + "loss": 0.0002, + "step": 1399 + }, + { + "epoch": 1.2420749279538905, + "grad_norm": 0.513083815574646, + "learning_rate": 6.601434842178678e-06, + "loss": 0.0088, + "step": 1400 + }, + { + "epoch": 1.2429616493017068, + "grad_norm": 0.038441240787506104, + "learning_rate": 6.596898760785731e-06, + "loss": 0.0002, + "step": 1401 + }, + { + "epoch": 1.2438483706495234, + "grad_norm": 1.2739007472991943, + "learning_rate": 6.592361215313365e-06, + "loss": 0.0077, + "step": 1402 + }, + { + "epoch": 1.24473509199734, + "grad_norm": 0.3760228157043457, + "learning_rate": 6.587822209921726e-06, + "loss": 0.005, + "step": 1403 + }, + { + "epoch": 1.2456218133451562, + "grad_norm": 0.2797979414463043, + "learning_rate": 6.583281748772291e-06, + "loss": 0.0008, + "step": 1404 + }, + { + "epoch": 1.2465085346929727, + "grad_norm": 0.30799371004104614, + "learning_rate": 6.5787398360278795e-06, + "loss": 0.0036, + "step": 1405 + }, + { + "epoch": 1.2473952560407893, + "grad_norm": 0.0893583670258522, + "learning_rate": 6.5741964758526375e-06, + "loss": 0.0005, + "step": 1406 + }, + { + "epoch": 1.2482819773886056, + "grad_norm": 0.5812922716140747, + "learning_rate": 6.569651672412038e-06, + "loss": 0.0091, + "step": 1407 + }, + { + "epoch": 1.2491686987364221, + "grad_norm": 0.4839387834072113, + "learning_rate": 6.56510542987288e-06, + "loss": 0.0051, + "step": 1408 + }, + { + "epoch": 1.2500554200842386, + "grad_norm": 0.04253406077623367, + "learning_rate": 6.560557752403277e-06, + "loss": 0.0005, + "step": 1409 + }, + { + "epoch": 1.250942141432055, + "grad_norm": 0.3183040916919708, + "learning_rate": 6.556008644172664e-06, + "loss": 0.0042, + "step": 1410 + }, + { + "epoch": 1.2518288627798715, + "grad_norm": 0.05635031685233116, + "learning_rate": 6.551458109351785e-06, + "loss": 0.0004, + "step": 1411 + }, + { + "epoch": 1.2527155841276878, + "grad_norm": 0.35388946533203125, + "learning_rate": 6.546906152112689e-06, + "loss": 0.0101, + "step": 1412 + }, + { + "epoch": 1.2536023054755043, + "grad_norm": 0.26416826248168945, + "learning_rate": 6.542352776628733e-06, + "loss": 0.0027, + "step": 1413 + }, + { + "epoch": 1.2544890268233209, + "grad_norm": 0.06187260150909424, + "learning_rate": 6.537797987074574e-06, + "loss": 0.0006, + "step": 1414 + }, + { + "epoch": 1.2553757481711372, + "grad_norm": 0.399342805147171, + "learning_rate": 6.533241787626165e-06, + "loss": 0.0059, + "step": 1415 + }, + { + "epoch": 1.2562624695189537, + "grad_norm": 0.34618619084358215, + "learning_rate": 6.5286841824607515e-06, + "loss": 0.0024, + "step": 1416 + }, + { + "epoch": 1.25714919086677, + "grad_norm": 0.08505801856517792, + "learning_rate": 6.524125175756866e-06, + "loss": 0.001, + "step": 1417 + }, + { + "epoch": 1.2580359122145865, + "grad_norm": 0.28683024644851685, + "learning_rate": 6.519564771694328e-06, + "loss": 0.0032, + "step": 1418 + }, + { + "epoch": 1.258922633562403, + "grad_norm": 0.1888272911310196, + "learning_rate": 6.515002974454239e-06, + "loss": 0.0019, + "step": 1419 + }, + { + "epoch": 1.2598093549102194, + "grad_norm": 0.10655654221773148, + "learning_rate": 6.510439788218975e-06, + "loss": 0.001, + "step": 1420 + }, + { + "epoch": 1.260696076258036, + "grad_norm": 0.16792140901088715, + "learning_rate": 6.505875217172188e-06, + "loss": 0.0017, + "step": 1421 + }, + { + "epoch": 1.2615827976058522, + "grad_norm": 0.5658694505691528, + "learning_rate": 6.501309265498797e-06, + "loss": 0.0057, + "step": 1422 + }, + { + "epoch": 1.2624695189536688, + "grad_norm": 0.3947063386440277, + "learning_rate": 6.496741937384992e-06, + "loss": 0.003, + "step": 1423 + }, + { + "epoch": 1.2633562403014853, + "grad_norm": 0.23458246886730194, + "learning_rate": 6.492173237018217e-06, + "loss": 0.0049, + "step": 1424 + }, + { + "epoch": 1.2642429616493018, + "grad_norm": 0.2951560616493225, + "learning_rate": 6.48760316858718e-06, + "loss": 0.0042, + "step": 1425 + }, + { + "epoch": 1.2651296829971181, + "grad_norm": 0.10598162561655045, + "learning_rate": 6.483031736281843e-06, + "loss": 0.0009, + "step": 1426 + }, + { + "epoch": 1.2660164043449347, + "grad_norm": 0.2090543806552887, + "learning_rate": 6.478458944293416e-06, + "loss": 0.0011, + "step": 1427 + }, + { + "epoch": 1.266903125692751, + "grad_norm": 0.18498195707798004, + "learning_rate": 6.473884796814357e-06, + "loss": 0.0019, + "step": 1428 + }, + { + "epoch": 1.2677898470405675, + "grad_norm": 0.3451266288757324, + "learning_rate": 6.469309298038367e-06, + "loss": 0.0038, + "step": 1429 + }, + { + "epoch": 1.268676568388384, + "grad_norm": 0.35533058643341064, + "learning_rate": 6.4647324521603845e-06, + "loss": 0.0062, + "step": 1430 + }, + { + "epoch": 1.2695632897362004, + "grad_norm": 0.1544898897409439, + "learning_rate": 6.4601542633765834e-06, + "loss": 0.0011, + "step": 1431 + }, + { + "epoch": 1.270450011084017, + "grad_norm": 0.4989369809627533, + "learning_rate": 6.455574735884373e-06, + "loss": 0.0028, + "step": 1432 + }, + { + "epoch": 1.2713367324318332, + "grad_norm": 0.13420936465263367, + "learning_rate": 6.450993873882383e-06, + "loss": 0.0007, + "step": 1433 + }, + { + "epoch": 1.2722234537796497, + "grad_norm": 0.046916741877794266, + "learning_rate": 6.44641168157047e-06, + "loss": 0.0004, + "step": 1434 + }, + { + "epoch": 1.2731101751274663, + "grad_norm": 0.7304254174232483, + "learning_rate": 6.44182816314971e-06, + "loss": 0.0039, + "step": 1435 + }, + { + "epoch": 1.2739968964752826, + "grad_norm": 0.979017972946167, + "learning_rate": 6.4372433228224e-06, + "loss": 0.0069, + "step": 1436 + }, + { + "epoch": 1.2748836178230991, + "grad_norm": 0.054384853690862656, + "learning_rate": 6.432657164792038e-06, + "loss": 0.0004, + "step": 1437 + }, + { + "epoch": 1.2757703391709154, + "grad_norm": 0.21733501553535461, + "learning_rate": 6.428069693263337e-06, + "loss": 0.0024, + "step": 1438 + }, + { + "epoch": 1.276657060518732, + "grad_norm": 0.3343576192855835, + "learning_rate": 6.423480912442216e-06, + "loss": 0.0031, + "step": 1439 + }, + { + "epoch": 1.2775437818665485, + "grad_norm": 0.2468477189540863, + "learning_rate": 6.418890826535791e-06, + "loss": 0.0018, + "step": 1440 + }, + { + "epoch": 1.278430503214365, + "grad_norm": 0.38270869851112366, + "learning_rate": 6.4142994397523735e-06, + "loss": 0.004, + "step": 1441 + }, + { + "epoch": 1.2793172245621813, + "grad_norm": 0.03215061128139496, + "learning_rate": 6.409706756301473e-06, + "loss": 0.0003, + "step": 1442 + }, + { + "epoch": 1.2802039459099979, + "grad_norm": 0.06875225156545639, + "learning_rate": 6.405112780393781e-06, + "loss": 0.0005, + "step": 1443 + }, + { + "epoch": 1.2810906672578142, + "grad_norm": 0.21331429481506348, + "learning_rate": 6.400517516241178e-06, + "loss": 0.0007, + "step": 1444 + }, + { + "epoch": 1.2819773886056307, + "grad_norm": 0.21490368247032166, + "learning_rate": 6.395920968056725e-06, + "loss": 0.0005, + "step": 1445 + }, + { + "epoch": 1.2828641099534472, + "grad_norm": 0.09645006060600281, + "learning_rate": 6.39132314005466e-06, + "loss": 0.0007, + "step": 1446 + }, + { + "epoch": 1.2837508313012636, + "grad_norm": 0.3721943795681, + "learning_rate": 6.386724036450396e-06, + "loss": 0.0065, + "step": 1447 + }, + { + "epoch": 1.28463755264908, + "grad_norm": 0.22285206615924835, + "learning_rate": 6.382123661460511e-06, + "loss": 0.0015, + "step": 1448 + }, + { + "epoch": 1.2855242739968964, + "grad_norm": 0.08951204270124435, + "learning_rate": 6.377522019302756e-06, + "loss": 0.0007, + "step": 1449 + }, + { + "epoch": 1.286410995344713, + "grad_norm": 0.23946380615234375, + "learning_rate": 6.372919114196035e-06, + "loss": 0.0013, + "step": 1450 + }, + { + "epoch": 1.2872977166925295, + "grad_norm": 0.05512690171599388, + "learning_rate": 6.368314950360416e-06, + "loss": 0.0004, + "step": 1451 + }, + { + "epoch": 1.2881844380403458, + "grad_norm": 0.43557584285736084, + "learning_rate": 6.3637095320171185e-06, + "loss": 0.0027, + "step": 1452 + }, + { + "epoch": 1.2890711593881623, + "grad_norm": 0.27949899435043335, + "learning_rate": 6.359102863388515e-06, + "loss": 0.0013, + "step": 1453 + }, + { + "epoch": 1.2899578807359786, + "grad_norm": 0.18375392258167267, + "learning_rate": 6.35449494869812e-06, + "loss": 0.001, + "step": 1454 + }, + { + "epoch": 1.2908446020837951, + "grad_norm": 0.1614723652601242, + "learning_rate": 6.349885792170595e-06, + "loss": 0.0009, + "step": 1455 + }, + { + "epoch": 1.2917313234316117, + "grad_norm": 0.13602754473686218, + "learning_rate": 6.345275398031736e-06, + "loss": 0.0007, + "step": 1456 + }, + { + "epoch": 1.292618044779428, + "grad_norm": 0.22377264499664307, + "learning_rate": 6.340663770508476e-06, + "loss": 0.0014, + "step": 1457 + }, + { + "epoch": 1.2935047661272445, + "grad_norm": 0.7576924562454224, + "learning_rate": 6.33605091382888e-06, + "loss": 0.0027, + "step": 1458 + }, + { + "epoch": 1.2943914874750608, + "grad_norm": 0.019061360508203506, + "learning_rate": 6.331436832222133e-06, + "loss": 0.0001, + "step": 1459 + }, + { + "epoch": 1.2952782088228774, + "grad_norm": 0.2781587839126587, + "learning_rate": 6.3268215299185545e-06, + "loss": 0.0011, + "step": 1460 + }, + { + "epoch": 1.296164930170694, + "grad_norm": 0.0837785005569458, + "learning_rate": 6.3222050111495715e-06, + "loss": 0.0004, + "step": 1461 + }, + { + "epoch": 1.2970516515185104, + "grad_norm": 0.37472015619277954, + "learning_rate": 6.317587280147737e-06, + "loss": 0.0069, + "step": 1462 + }, + { + "epoch": 1.2979383728663267, + "grad_norm": 0.09563448280096054, + "learning_rate": 6.312968341146705e-06, + "loss": 0.0004, + "step": 1463 + }, + { + "epoch": 1.2988250942141433, + "grad_norm": 0.029751967638731003, + "learning_rate": 6.308348198381244e-06, + "loss": 0.0002, + "step": 1464 + }, + { + "epoch": 1.2997118155619596, + "grad_norm": 0.5495652556419373, + "learning_rate": 6.303726856087227e-06, + "loss": 0.0031, + "step": 1465 + }, + { + "epoch": 1.3005985369097761, + "grad_norm": 0.20881287753582, + "learning_rate": 6.299104318501619e-06, + "loss": 0.0007, + "step": 1466 + }, + { + "epoch": 1.3014852582575926, + "grad_norm": 0.5076218843460083, + "learning_rate": 6.294480589862491e-06, + "loss": 0.0035, + "step": 1467 + }, + { + "epoch": 1.302371979605409, + "grad_norm": 0.3442370593547821, + "learning_rate": 6.289855674408995e-06, + "loss": 0.0034, + "step": 1468 + }, + { + "epoch": 1.3032587009532255, + "grad_norm": 0.9615652561187744, + "learning_rate": 6.285229576381382e-06, + "loss": 0.0082, + "step": 1469 + }, + { + "epoch": 1.3041454223010418, + "grad_norm": 0.042115919291973114, + "learning_rate": 6.280602300020982e-06, + "loss": 0.0002, + "step": 1470 + }, + { + "epoch": 1.3050321436488583, + "grad_norm": 0.15722385048866272, + "learning_rate": 6.275973849570201e-06, + "loss": 0.0008, + "step": 1471 + }, + { + "epoch": 1.3059188649966749, + "grad_norm": 0.011659053154289722, + "learning_rate": 6.27134422927253e-06, + "loss": 0.0001, + "step": 1472 + }, + { + "epoch": 1.3068055863444912, + "grad_norm": 0.05210238695144653, + "learning_rate": 6.266713443372527e-06, + "loss": 0.0003, + "step": 1473 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.6101747751235962, + "learning_rate": 6.262081496115822e-06, + "loss": 0.0065, + "step": 1474 + }, + { + "epoch": 1.308579029040124, + "grad_norm": 0.5870680212974548, + "learning_rate": 6.257448391749104e-06, + "loss": 0.0044, + "step": 1475 + }, + { + "epoch": 1.3094657503879406, + "grad_norm": 0.03537415340542793, + "learning_rate": 6.252814134520131e-06, + "loss": 0.0003, + "step": 1476 + }, + { + "epoch": 1.310352471735757, + "grad_norm": 0.23479506373405457, + "learning_rate": 6.2481787286777116e-06, + "loss": 0.0007, + "step": 1477 + }, + { + "epoch": 1.3112391930835736, + "grad_norm": 0.6235648989677429, + "learning_rate": 6.24354217847171e-06, + "loss": 0.0072, + "step": 1478 + }, + { + "epoch": 1.31212591443139, + "grad_norm": 0.5917138457298279, + "learning_rate": 6.238904488153039e-06, + "loss": 0.0114, + "step": 1479 + }, + { + "epoch": 1.3130126357792065, + "grad_norm": 0.3008297085762024, + "learning_rate": 6.234265661973656e-06, + "loss": 0.0026, + "step": 1480 + }, + { + "epoch": 1.3138993571270228, + "grad_norm": 0.2608731687068939, + "learning_rate": 6.229625704186564e-06, + "loss": 0.0043, + "step": 1481 + }, + { + "epoch": 1.3147860784748393, + "grad_norm": 0.46215519309043884, + "learning_rate": 6.2249846190457964e-06, + "loss": 0.0085, + "step": 1482 + }, + { + "epoch": 1.3156727998226558, + "grad_norm": 0.10965245962142944, + "learning_rate": 6.220342410806426e-06, + "loss": 0.0006, + "step": 1483 + }, + { + "epoch": 1.3165595211704721, + "grad_norm": 0.10256587713956833, + "learning_rate": 6.215699083724553e-06, + "loss": 0.0007, + "step": 1484 + }, + { + "epoch": 1.3174462425182887, + "grad_norm": 0.03453338146209717, + "learning_rate": 6.211054642057301e-06, + "loss": 0.0004, + "step": 1485 + }, + { + "epoch": 1.318332963866105, + "grad_norm": 0.14215219020843506, + "learning_rate": 6.206409090062823e-06, + "loss": 0.0011, + "step": 1486 + }, + { + "epoch": 1.3192196852139215, + "grad_norm": 0.30575263500213623, + "learning_rate": 6.201762432000283e-06, + "loss": 0.002, + "step": 1487 + }, + { + "epoch": 1.320106406561738, + "grad_norm": 0.19290179014205933, + "learning_rate": 6.197114672129859e-06, + "loss": 0.0011, + "step": 1488 + }, + { + "epoch": 1.3209931279095544, + "grad_norm": 0.10399499535560608, + "learning_rate": 6.192465814712744e-06, + "loss": 0.0008, + "step": 1489 + }, + { + "epoch": 1.321879849257371, + "grad_norm": 0.20446741580963135, + "learning_rate": 6.187815864011133e-06, + "loss": 0.0012, + "step": 1490 + }, + { + "epoch": 1.3227665706051872, + "grad_norm": 0.07639744877815247, + "learning_rate": 6.183164824288226e-06, + "loss": 0.001, + "step": 1491 + }, + { + "epoch": 1.3236532919530037, + "grad_norm": 0.04390874132514, + "learning_rate": 6.17851269980822e-06, + "loss": 0.0006, + "step": 1492 + }, + { + "epoch": 1.3245400133008203, + "grad_norm": 0.20111678540706635, + "learning_rate": 6.173859494836309e-06, + "loss": 0.0012, + "step": 1493 + }, + { + "epoch": 1.3254267346486366, + "grad_norm": 0.1883375197649002, + "learning_rate": 6.169205213638671e-06, + "loss": 0.0029, + "step": 1494 + }, + { + "epoch": 1.3263134559964531, + "grad_norm": 0.1244998425245285, + "learning_rate": 6.1645498604824785e-06, + "loss": 0.0004, + "step": 1495 + }, + { + "epoch": 1.3272001773442694, + "grad_norm": 0.11627326160669327, + "learning_rate": 6.1598934396358826e-06, + "loss": 0.0012, + "step": 1496 + }, + { + "epoch": 1.328086898692086, + "grad_norm": 0.12732315063476562, + "learning_rate": 6.155235955368014e-06, + "loss": 0.0008, + "step": 1497 + }, + { + "epoch": 1.3289736200399025, + "grad_norm": 0.18036852777004242, + "learning_rate": 6.1505774119489805e-06, + "loss": 0.0009, + "step": 1498 + }, + { + "epoch": 1.329860341387719, + "grad_norm": 0.2986014783382416, + "learning_rate": 6.145917813649856e-06, + "loss": 0.0022, + "step": 1499 + }, + { + "epoch": 1.3307470627355353, + "grad_norm": 0.11444338411092758, + "learning_rate": 6.141257164742689e-06, + "loss": 0.0009, + "step": 1500 + }, + { + "epoch": 1.3316337840833519, + "grad_norm": 0.3222125470638275, + "learning_rate": 6.1365954695004815e-06, + "loss": 0.0024, + "step": 1501 + }, + { + "epoch": 1.3325205054311682, + "grad_norm": 0.17182210087776184, + "learning_rate": 6.131932732197205e-06, + "loss": 0.0005, + "step": 1502 + }, + { + "epoch": 1.3334072267789847, + "grad_norm": 0.5611296892166138, + "learning_rate": 6.127268957107777e-06, + "loss": 0.0045, + "step": 1503 + }, + { + "epoch": 1.3342939481268012, + "grad_norm": 0.11059463024139404, + "learning_rate": 6.122604148508076e-06, + "loss": 0.0005, + "step": 1504 + }, + { + "epoch": 1.3351806694746176, + "grad_norm": 0.4216887354850769, + "learning_rate": 6.117938310674918e-06, + "loss": 0.0023, + "step": 1505 + }, + { + "epoch": 1.336067390822434, + "grad_norm": 0.04834161698818207, + "learning_rate": 6.113271447886071e-06, + "loss": 0.0002, + "step": 1506 + }, + { + "epoch": 1.3369541121702504, + "grad_norm": 0.044341325759887695, + "learning_rate": 6.108603564420239e-06, + "loss": 0.0002, + "step": 1507 + }, + { + "epoch": 1.337840833518067, + "grad_norm": 0.5602015256881714, + "learning_rate": 6.103934664557061e-06, + "loss": 0.0045, + "step": 1508 + }, + { + "epoch": 1.3387275548658835, + "grad_norm": 0.10628864914178848, + "learning_rate": 6.09926475257711e-06, + "loss": 0.0005, + "step": 1509 + }, + { + "epoch": 1.3396142762136998, + "grad_norm": 0.47171446681022644, + "learning_rate": 6.0945938327618855e-06, + "loss": 0.0074, + "step": 1510 + }, + { + "epoch": 1.3405009975615163, + "grad_norm": 0.41396036744117737, + "learning_rate": 6.089921909393812e-06, + "loss": 0.0027, + "step": 1511 + }, + { + "epoch": 1.3413877189093326, + "grad_norm": 0.5207869410514832, + "learning_rate": 6.085248986756233e-06, + "loss": 0.0069, + "step": 1512 + }, + { + "epoch": 1.3422744402571491, + "grad_norm": 0.057785842567682266, + "learning_rate": 6.08057506913341e-06, + "loss": 0.0004, + "step": 1513 + }, + { + "epoch": 1.3431611616049657, + "grad_norm": 0.6101086735725403, + "learning_rate": 6.0759001608105126e-06, + "loss": 0.0066, + "step": 1514 + }, + { + "epoch": 1.3440478829527822, + "grad_norm": 0.4098615050315857, + "learning_rate": 6.071224266073625e-06, + "loss": 0.0055, + "step": 1515 + }, + { + "epoch": 1.3449346043005985, + "grad_norm": 0.33896300196647644, + "learning_rate": 6.066547389209731e-06, + "loss": 0.0023, + "step": 1516 + }, + { + "epoch": 1.345821325648415, + "grad_norm": 0.49446535110473633, + "learning_rate": 6.061869534506715e-06, + "loss": 0.0073, + "step": 1517 + }, + { + "epoch": 1.3467080469962314, + "grad_norm": 0.15998847782611847, + "learning_rate": 6.0571907062533604e-06, + "loss": 0.0008, + "step": 1518 + }, + { + "epoch": 1.347594768344048, + "grad_norm": 0.2155783772468567, + "learning_rate": 6.052510908739342e-06, + "loss": 0.0019, + "step": 1519 + }, + { + "epoch": 1.3484814896918644, + "grad_norm": 0.41385430097579956, + "learning_rate": 6.047830146255221e-06, + "loss": 0.0053, + "step": 1520 + }, + { + "epoch": 1.3493682110396807, + "grad_norm": 0.11063788831233978, + "learning_rate": 6.04314842309245e-06, + "loss": 0.0013, + "step": 1521 + }, + { + "epoch": 1.3502549323874973, + "grad_norm": 0.09609077125787735, + "learning_rate": 6.038465743543353e-06, + "loss": 0.0007, + "step": 1522 + }, + { + "epoch": 1.3511416537353136, + "grad_norm": 0.12723086774349213, + "learning_rate": 6.033782111901138e-06, + "loss": 0.0008, + "step": 1523 + }, + { + "epoch": 1.3520283750831301, + "grad_norm": 1.1818429231643677, + "learning_rate": 6.029097532459881e-06, + "loss": 0.0073, + "step": 1524 + }, + { + "epoch": 1.3529150964309467, + "grad_norm": 0.09564555436372757, + "learning_rate": 6.024412009514533e-06, + "loss": 0.001, + "step": 1525 + }, + { + "epoch": 1.353801817778763, + "grad_norm": 0.13707663118839264, + "learning_rate": 6.019725547360905e-06, + "loss": 0.0011, + "step": 1526 + }, + { + "epoch": 1.3546885391265795, + "grad_norm": 0.5464258790016174, + "learning_rate": 6.01503815029567e-06, + "loss": 0.0058, + "step": 1527 + }, + { + "epoch": 1.3555752604743958, + "grad_norm": 0.29684606194496155, + "learning_rate": 6.01034982261636e-06, + "loss": 0.0018, + "step": 1528 + }, + { + "epoch": 1.3564619818222123, + "grad_norm": 0.03409336507320404, + "learning_rate": 6.005660568621359e-06, + "loss": 0.0004, + "step": 1529 + }, + { + "epoch": 1.3573487031700289, + "grad_norm": 0.14027540385723114, + "learning_rate": 6.000970392609899e-06, + "loss": 0.0012, + "step": 1530 + }, + { + "epoch": 1.3582354245178454, + "grad_norm": 0.4142434597015381, + "learning_rate": 5.996279298882058e-06, + "loss": 0.0038, + "step": 1531 + }, + { + "epoch": 1.3591221458656617, + "grad_norm": 0.06057576835155487, + "learning_rate": 5.991587291738759e-06, + "loss": 0.0007, + "step": 1532 + }, + { + "epoch": 1.3600088672134782, + "grad_norm": 0.3078071177005768, + "learning_rate": 5.986894375481757e-06, + "loss": 0.0019, + "step": 1533 + }, + { + "epoch": 1.3608955885612946, + "grad_norm": 0.31704726815223694, + "learning_rate": 5.982200554413643e-06, + "loss": 0.0019, + "step": 1534 + }, + { + "epoch": 1.361782309909111, + "grad_norm": 0.08941329270601273, + "learning_rate": 5.977505832837839e-06, + "loss": 0.0005, + "step": 1535 + }, + { + "epoch": 1.3626690312569276, + "grad_norm": 0.5131560564041138, + "learning_rate": 5.9728102150585885e-06, + "loss": 0.0019, + "step": 1536 + }, + { + "epoch": 1.363555752604744, + "grad_norm": 0.11163872480392456, + "learning_rate": 5.968113705380961e-06, + "loss": 0.0007, + "step": 1537 + }, + { + "epoch": 1.3644424739525605, + "grad_norm": 0.2688218355178833, + "learning_rate": 5.963416308110843e-06, + "loss": 0.0017, + "step": 1538 + }, + { + "epoch": 1.3653291953003768, + "grad_norm": 0.1860375553369522, + "learning_rate": 5.9587180275549324e-06, + "loss": 0.0027, + "step": 1539 + }, + { + "epoch": 1.3662159166481933, + "grad_norm": 0.57248455286026, + "learning_rate": 5.954018868020737e-06, + "loss": 0.006, + "step": 1540 + }, + { + "epoch": 1.3671026379960098, + "grad_norm": 0.47507718205451965, + "learning_rate": 5.9493188338165745e-06, + "loss": 0.0038, + "step": 1541 + }, + { + "epoch": 1.3679893593438262, + "grad_norm": 0.04093353450298309, + "learning_rate": 5.94461792925156e-06, + "loss": 0.0003, + "step": 1542 + }, + { + "epoch": 1.3688760806916427, + "grad_norm": 0.2782546281814575, + "learning_rate": 5.939916158635608e-06, + "loss": 0.0015, + "step": 1543 + }, + { + "epoch": 1.369762802039459, + "grad_norm": 0.444394588470459, + "learning_rate": 5.935213526279428e-06, + "loss": 0.0028, + "step": 1544 + }, + { + "epoch": 1.3706495233872755, + "grad_norm": 0.6129445433616638, + "learning_rate": 5.93051003649452e-06, + "loss": 0.0055, + "step": 1545 + }, + { + "epoch": 1.371536244735092, + "grad_norm": 0.3048208951950073, + "learning_rate": 5.925805693593167e-06, + "loss": 0.0016, + "step": 1546 + }, + { + "epoch": 1.3724229660829084, + "grad_norm": 0.013988547958433628, + "learning_rate": 5.921100501888437e-06, + "loss": 0.0001, + "step": 1547 + }, + { + "epoch": 1.373309687430725, + "grad_norm": 0.05953359231352806, + "learning_rate": 5.916394465694175e-06, + "loss": 0.0004, + "step": 1548 + }, + { + "epoch": 1.3741964087785412, + "grad_norm": 0.12987588346004486, + "learning_rate": 5.911687589325003e-06, + "loss": 0.0006, + "step": 1549 + }, + { + "epoch": 1.3750831301263577, + "grad_norm": 0.18447338044643402, + "learning_rate": 5.906979877096306e-06, + "loss": 0.0009, + "step": 1550 + }, + { + "epoch": 1.3759698514741743, + "grad_norm": 0.05198747292160988, + "learning_rate": 5.902271333324245e-06, + "loss": 0.0001, + "step": 1551 + }, + { + "epoch": 1.3768565728219908, + "grad_norm": 0.3160160183906555, + "learning_rate": 5.897561962325737e-06, + "loss": 0.0011, + "step": 1552 + }, + { + "epoch": 1.3777432941698071, + "grad_norm": 0.2100251019001007, + "learning_rate": 5.892851768418458e-06, + "loss": 0.0007, + "step": 1553 + }, + { + "epoch": 1.3786300155176237, + "grad_norm": 0.06642001122236252, + "learning_rate": 5.88814075592084e-06, + "loss": 0.0005, + "step": 1554 + }, + { + "epoch": 1.37951673686544, + "grad_norm": 0.08171337842941284, + "learning_rate": 5.883428929152067e-06, + "loss": 0.0003, + "step": 1555 + }, + { + "epoch": 1.3804034582132565, + "grad_norm": 0.4593549072742462, + "learning_rate": 5.878716292432065e-06, + "loss": 0.0023, + "step": 1556 + }, + { + "epoch": 1.381290179561073, + "grad_norm": 0.2839002013206482, + "learning_rate": 5.874002850081506e-06, + "loss": 0.0015, + "step": 1557 + }, + { + "epoch": 1.3821769009088893, + "grad_norm": 0.49203285574913025, + "learning_rate": 5.8692886064218015e-06, + "loss": 0.0011, + "step": 1558 + }, + { + "epoch": 1.3830636222567059, + "grad_norm": 0.018628446385264397, + "learning_rate": 5.864573565775092e-06, + "loss": 0.0001, + "step": 1559 + }, + { + "epoch": 1.3839503436045222, + "grad_norm": 0.716225266456604, + "learning_rate": 5.859857732464258e-06, + "loss": 0.0057, + "step": 1560 + }, + { + "epoch": 1.3848370649523387, + "grad_norm": 0.082828588783741, + "learning_rate": 5.855141110812896e-06, + "loss": 0.0002, + "step": 1561 + }, + { + "epoch": 1.3857237863001552, + "grad_norm": 0.5517600178718567, + "learning_rate": 5.850423705145334e-06, + "loss": 0.0026, + "step": 1562 + }, + { + "epoch": 1.3866105076479716, + "grad_norm": 0.3616451025009155, + "learning_rate": 5.845705519786613e-06, + "loss": 0.0007, + "step": 1563 + }, + { + "epoch": 1.387497228995788, + "grad_norm": 0.39561301469802856, + "learning_rate": 5.8409865590624955e-06, + "loss": 0.004, + "step": 1564 + }, + { + "epoch": 1.3883839503436044, + "grad_norm": 0.3806546926498413, + "learning_rate": 5.836266827299446e-06, + "loss": 0.0043, + "step": 1565 + }, + { + "epoch": 1.389270671691421, + "grad_norm": 0.005371665582060814, + "learning_rate": 5.831546328824643e-06, + "loss": 0.0, + "step": 1566 + }, + { + "epoch": 1.3901573930392375, + "grad_norm": 1.6570253372192383, + "learning_rate": 5.826825067965967e-06, + "loss": 0.0176, + "step": 1567 + }, + { + "epoch": 1.391044114387054, + "grad_norm": 0.19763243198394775, + "learning_rate": 5.822103049051993e-06, + "loss": 0.0009, + "step": 1568 + }, + { + "epoch": 1.3919308357348703, + "grad_norm": 1.0521655082702637, + "learning_rate": 5.817380276411995e-06, + "loss": 0.0014, + "step": 1569 + }, + { + "epoch": 1.3928175570826868, + "grad_norm": 0.4388374984264374, + "learning_rate": 5.812656754375937e-06, + "loss": 0.0145, + "step": 1570 + }, + { + "epoch": 1.3937042784305032, + "grad_norm": 4.534085750579834, + "learning_rate": 5.8079324872744706e-06, + "loss": 0.0201, + "step": 1571 + }, + { + "epoch": 1.3945909997783197, + "grad_norm": 0.01597362942993641, + "learning_rate": 5.803207479438929e-06, + "loss": 0.0001, + "step": 1572 + }, + { + "epoch": 1.3954777211261362, + "grad_norm": 0.13055969774723053, + "learning_rate": 5.798481735201327e-06, + "loss": 0.0009, + "step": 1573 + }, + { + "epoch": 1.3963644424739525, + "grad_norm": 0.13696056604385376, + "learning_rate": 5.793755258894354e-06, + "loss": 0.0003, + "step": 1574 + }, + { + "epoch": 1.397251163821769, + "grad_norm": 0.11396967619657516, + "learning_rate": 5.789028054851366e-06, + "loss": 0.0005, + "step": 1575 + }, + { + "epoch": 1.3981378851695854, + "grad_norm": 0.059259578585624695, + "learning_rate": 5.784300127406393e-06, + "loss": 0.0003, + "step": 1576 + }, + { + "epoch": 1.399024606517402, + "grad_norm": 0.2292395532131195, + "learning_rate": 5.779571480894122e-06, + "loss": 0.0013, + "step": 1577 + }, + { + "epoch": 1.3999113278652184, + "grad_norm": 0.06261775642633438, + "learning_rate": 5.774842119649905e-06, + "loss": 0.0004, + "step": 1578 + }, + { + "epoch": 1.4007980492130347, + "grad_norm": 0.27105244994163513, + "learning_rate": 5.770112048009747e-06, + "loss": 0.0025, + "step": 1579 + }, + { + "epoch": 1.4016847705608513, + "grad_norm": 0.12989850342273712, + "learning_rate": 5.765381270310302e-06, + "loss": 0.0005, + "step": 1580 + }, + { + "epoch": 1.4025714919086676, + "grad_norm": 0.43756383657455444, + "learning_rate": 5.7606497908888755e-06, + "loss": 0.004, + "step": 1581 + }, + { + "epoch": 1.4034582132564841, + "grad_norm": 0.27995091676712036, + "learning_rate": 5.755917614083412e-06, + "loss": 0.0026, + "step": 1582 + }, + { + "epoch": 1.4043449346043007, + "grad_norm": 0.15632747113704681, + "learning_rate": 5.751184744232499e-06, + "loss": 0.0009, + "step": 1583 + }, + { + "epoch": 1.405231655952117, + "grad_norm": 1.14409339427948, + "learning_rate": 5.746451185675358e-06, + "loss": 0.004, + "step": 1584 + }, + { + "epoch": 1.4061183772999335, + "grad_norm": 0.19565550982952118, + "learning_rate": 5.7417169427518394e-06, + "loss": 0.0016, + "step": 1585 + }, + { + "epoch": 1.4070050986477498, + "grad_norm": 0.43583473563194275, + "learning_rate": 5.736982019802427e-06, + "loss": 0.0039, + "step": 1586 + }, + { + "epoch": 1.4078918199955663, + "grad_norm": 0.3083972632884979, + "learning_rate": 5.732246421168223e-06, + "loss": 0.0021, + "step": 1587 + }, + { + "epoch": 1.4087785413433829, + "grad_norm": 0.4371331036090851, + "learning_rate": 5.727510151190949e-06, + "loss": 0.0037, + "step": 1588 + }, + { + "epoch": 1.4096652626911994, + "grad_norm": 0.24955230951309204, + "learning_rate": 5.722773214212946e-06, + "loss": 0.0046, + "step": 1589 + }, + { + "epoch": 1.4105519840390157, + "grad_norm": 0.18335281312465668, + "learning_rate": 5.718035614577164e-06, + "loss": 0.001, + "step": 1590 + }, + { + "epoch": 1.4114387053868322, + "grad_norm": 0.4750370979309082, + "learning_rate": 5.713297356627161e-06, + "loss": 0.0046, + "step": 1591 + }, + { + "epoch": 1.4123254267346486, + "grad_norm": 0.1709708869457245, + "learning_rate": 5.708558444707097e-06, + "loss": 0.0012, + "step": 1592 + }, + { + "epoch": 1.413212148082465, + "grad_norm": 0.6528603434562683, + "learning_rate": 5.7038188831617345e-06, + "loss": 0.0042, + "step": 1593 + }, + { + "epoch": 1.4140988694302816, + "grad_norm": 0.25174710154533386, + "learning_rate": 5.699078676336429e-06, + "loss": 0.0017, + "step": 1594 + }, + { + "epoch": 1.414985590778098, + "grad_norm": 0.07705551385879517, + "learning_rate": 5.69433782857713e-06, + "loss": 0.0004, + "step": 1595 + }, + { + "epoch": 1.4158723121259145, + "grad_norm": 0.10054202377796173, + "learning_rate": 5.68959634423037e-06, + "loss": 0.0005, + "step": 1596 + }, + { + "epoch": 1.4167590334737308, + "grad_norm": 0.09865143895149231, + "learning_rate": 5.684854227643274e-06, + "loss": 0.0006, + "step": 1597 + }, + { + "epoch": 1.4176457548215473, + "grad_norm": 0.2448788434267044, + "learning_rate": 5.680111483163535e-06, + "loss": 0.0023, + "step": 1598 + }, + { + "epoch": 1.4185324761693638, + "grad_norm": 0.14839304983615875, + "learning_rate": 5.675368115139432e-06, + "loss": 0.001, + "step": 1599 + }, + { + "epoch": 1.4194191975171802, + "grad_norm": 0.3813326060771942, + "learning_rate": 5.67062412791981e-06, + "loss": 0.0071, + "step": 1600 + }, + { + "epoch": 1.4203059188649967, + "grad_norm": 0.3784043490886688, + "learning_rate": 5.665879525854083e-06, + "loss": 0.0021, + "step": 1601 + }, + { + "epoch": 1.421192640212813, + "grad_norm": 0.7230820655822754, + "learning_rate": 5.661134313292232e-06, + "loss": 0.0088, + "step": 1602 + }, + { + "epoch": 1.4220793615606295, + "grad_norm": 0.3506825566291809, + "learning_rate": 5.656388494584788e-06, + "loss": 0.0071, + "step": 1603 + }, + { + "epoch": 1.422966082908446, + "grad_norm": 0.1567530781030655, + "learning_rate": 5.651642074082849e-06, + "loss": 0.0038, + "step": 1604 + }, + { + "epoch": 1.4238528042562626, + "grad_norm": 0.20204611122608185, + "learning_rate": 5.646895056138059e-06, + "loss": 0.002, + "step": 1605 + }, + { + "epoch": 1.424739525604079, + "grad_norm": 0.5711863040924072, + "learning_rate": 5.64214744510261e-06, + "loss": 0.0041, + "step": 1606 + }, + { + "epoch": 1.4256262469518954, + "grad_norm": 0.1815640926361084, + "learning_rate": 5.63739924532924e-06, + "loss": 0.0015, + "step": 1607 + }, + { + "epoch": 1.4265129682997117, + "grad_norm": 0.037703752517700195, + "learning_rate": 5.632650461171225e-06, + "loss": 0.0005, + "step": 1608 + }, + { + "epoch": 1.4273996896475283, + "grad_norm": 0.10812222957611084, + "learning_rate": 5.6279010969823775e-06, + "loss": 0.0009, + "step": 1609 + }, + { + "epoch": 1.4282864109953448, + "grad_norm": 0.13056260347366333, + "learning_rate": 5.623151157117038e-06, + "loss": 0.0007, + "step": 1610 + }, + { + "epoch": 1.4291731323431611, + "grad_norm": 0.17778554558753967, + "learning_rate": 5.6184006459300845e-06, + "loss": 0.0006, + "step": 1611 + }, + { + "epoch": 1.4300598536909777, + "grad_norm": 0.024510536342859268, + "learning_rate": 5.613649567776905e-06, + "loss": 0.0002, + "step": 1612 + }, + { + "epoch": 1.430946575038794, + "grad_norm": 0.1636805534362793, + "learning_rate": 5.60889792701342e-06, + "loss": 0.0008, + "step": 1613 + }, + { + "epoch": 1.4318332963866105, + "grad_norm": 0.1205761656165123, + "learning_rate": 5.60414572799606e-06, + "loss": 0.0005, + "step": 1614 + }, + { + "epoch": 1.432720017734427, + "grad_norm": 0.04635613411664963, + "learning_rate": 5.599392975081766e-06, + "loss": 0.0002, + "step": 1615 + }, + { + "epoch": 1.4336067390822433, + "grad_norm": 0.3327246308326721, + "learning_rate": 5.594639672627991e-06, + "loss": 0.0019, + "step": 1616 + }, + { + "epoch": 1.4344934604300599, + "grad_norm": 0.3695579767227173, + "learning_rate": 5.589885824992687e-06, + "loss": 0.0035, + "step": 1617 + }, + { + "epoch": 1.4353801817778762, + "grad_norm": 0.18341560661792755, + "learning_rate": 5.585131436534312e-06, + "loss": 0.0013, + "step": 1618 + }, + { + "epoch": 1.4362669031256927, + "grad_norm": 0.10450714826583862, + "learning_rate": 5.580376511611813e-06, + "loss": 0.0008, + "step": 1619 + }, + { + "epoch": 1.4371536244735093, + "grad_norm": 0.29582592844963074, + "learning_rate": 5.575621054584633e-06, + "loss": 0.0049, + "step": 1620 + }, + { + "epoch": 1.4380403458213258, + "grad_norm": 0.05713328719139099, + "learning_rate": 5.570865069812703e-06, + "loss": 0.0003, + "step": 1621 + }, + { + "epoch": 1.438927067169142, + "grad_norm": 0.027085937559604645, + "learning_rate": 5.5661085616564344e-06, + "loss": 0.0002, + "step": 1622 + }, + { + "epoch": 1.4398137885169586, + "grad_norm": 0.33440035581588745, + "learning_rate": 5.561351534476726e-06, + "loss": 0.002, + "step": 1623 + }, + { + "epoch": 1.440700509864775, + "grad_norm": 0.37347376346588135, + "learning_rate": 5.55659399263494e-06, + "loss": 0.0012, + "step": 1624 + }, + { + "epoch": 1.4415872312125915, + "grad_norm": 0.05196278169751167, + "learning_rate": 5.551835940492924e-06, + "loss": 0.0004, + "step": 1625 + }, + { + "epoch": 1.442473952560408, + "grad_norm": 0.354519248008728, + "learning_rate": 5.547077382412985e-06, + "loss": 0.0037, + "step": 1626 + }, + { + "epoch": 1.4433606739082243, + "grad_norm": 0.321370393037796, + "learning_rate": 5.542318322757895e-06, + "loss": 0.0016, + "step": 1627 + }, + { + "epoch": 1.4442473952560408, + "grad_norm": 0.035019587725400925, + "learning_rate": 5.537558765890888e-06, + "loss": 0.0002, + "step": 1628 + }, + { + "epoch": 1.4451341166038572, + "grad_norm": 0.7996811270713806, + "learning_rate": 5.532798716175652e-06, + "loss": 0.002, + "step": 1629 + }, + { + "epoch": 1.4460208379516737, + "grad_norm": 0.07523198425769806, + "learning_rate": 5.52803817797633e-06, + "loss": 0.0004, + "step": 1630 + }, + { + "epoch": 1.4469075592994902, + "grad_norm": 0.08954586833715439, + "learning_rate": 5.523277155657507e-06, + "loss": 0.0005, + "step": 1631 + }, + { + "epoch": 1.4477942806473065, + "grad_norm": 0.010750265792012215, + "learning_rate": 5.51851565358422e-06, + "loss": 0.0001, + "step": 1632 + }, + { + "epoch": 1.448681001995123, + "grad_norm": 0.138992577791214, + "learning_rate": 5.513753676121936e-06, + "loss": 0.0007, + "step": 1633 + }, + { + "epoch": 1.4495677233429394, + "grad_norm": 0.3091992437839508, + "learning_rate": 5.508991227636568e-06, + "loss": 0.0036, + "step": 1634 + }, + { + "epoch": 1.450454444690756, + "grad_norm": 0.12356047332286835, + "learning_rate": 5.50422831249445e-06, + "loss": 0.0002, + "step": 1635 + }, + { + "epoch": 1.4513411660385724, + "grad_norm": 0.24620208144187927, + "learning_rate": 5.499464935062355e-06, + "loss": 0.0011, + "step": 1636 + }, + { + "epoch": 1.4522278873863887, + "grad_norm": 0.07541129738092422, + "learning_rate": 5.494701099707475e-06, + "loss": 0.0002, + "step": 1637 + }, + { + "epoch": 1.4531146087342053, + "grad_norm": 0.13778525590896606, + "learning_rate": 5.489936810797416e-06, + "loss": 0.0004, + "step": 1638 + }, + { + "epoch": 1.4540013300820216, + "grad_norm": 0.014556895941495895, + "learning_rate": 5.48517207270021e-06, + "loss": 0.0001, + "step": 1639 + }, + { + "epoch": 1.4548880514298381, + "grad_norm": 0.43046942353248596, + "learning_rate": 5.480406889784293e-06, + "loss": 0.0014, + "step": 1640 + }, + { + "epoch": 1.4557747727776547, + "grad_norm": 0.30873122811317444, + "learning_rate": 5.4756412664185166e-06, + "loss": 0.0057, + "step": 1641 + }, + { + "epoch": 1.4566614941254712, + "grad_norm": 0.07226885110139847, + "learning_rate": 5.470875206972126e-06, + "loss": 0.0002, + "step": 1642 + }, + { + "epoch": 1.4575482154732875, + "grad_norm": 0.4386361539363861, + "learning_rate": 5.466108715814775e-06, + "loss": 0.0013, + "step": 1643 + }, + { + "epoch": 1.458434936821104, + "grad_norm": 0.9503544569015503, + "learning_rate": 5.46134179731651e-06, + "loss": 0.0019, + "step": 1644 + }, + { + "epoch": 1.4593216581689203, + "grad_norm": 0.9516314268112183, + "learning_rate": 5.456574455847767e-06, + "loss": 0.0039, + "step": 1645 + }, + { + "epoch": 1.4602083795167369, + "grad_norm": 0.33284255862236023, + "learning_rate": 5.451806695779376e-06, + "loss": 0.0016, + "step": 1646 + }, + { + "epoch": 1.4610951008645534, + "grad_norm": 0.8509582281112671, + "learning_rate": 5.447038521482542e-06, + "loss": 0.0038, + "step": 1647 + }, + { + "epoch": 1.4619818222123697, + "grad_norm": 0.07178089022636414, + "learning_rate": 5.442269937328856e-06, + "loss": 0.0001, + "step": 1648 + }, + { + "epoch": 1.4628685435601863, + "grad_norm": 0.3110867440700531, + "learning_rate": 5.4375009476902855e-06, + "loss": 0.0022, + "step": 1649 + }, + { + "epoch": 1.4637552649080026, + "grad_norm": 0.058392178267240524, + "learning_rate": 5.432731556939163e-06, + "loss": 0.0003, + "step": 1650 + }, + { + "epoch": 1.464641986255819, + "grad_norm": 0.5997623205184937, + "learning_rate": 5.427961769448199e-06, + "loss": 0.0034, + "step": 1651 + }, + { + "epoch": 1.4655287076036356, + "grad_norm": 0.7097246646881104, + "learning_rate": 5.423191589590457e-06, + "loss": 0.0043, + "step": 1652 + }, + { + "epoch": 1.466415428951452, + "grad_norm": 0.7380861639976501, + "learning_rate": 5.418421021739369e-06, + "loss": 0.008, + "step": 1653 + }, + { + "epoch": 1.4673021502992685, + "grad_norm": 0.6223729252815247, + "learning_rate": 5.413650070268716e-06, + "loss": 0.0023, + "step": 1654 + }, + { + "epoch": 1.4681888716470848, + "grad_norm": 0.269790381193161, + "learning_rate": 5.4088787395526365e-06, + "loss": 0.001, + "step": 1655 + }, + { + "epoch": 1.4690755929949013, + "grad_norm": 0.8038837313652039, + "learning_rate": 5.40410703396561e-06, + "loss": 0.0044, + "step": 1656 + }, + { + "epoch": 1.4699623143427178, + "grad_norm": 0.15287820994853973, + "learning_rate": 5.3993349578824675e-06, + "loss": 0.0008, + "step": 1657 + }, + { + "epoch": 1.4708490356905344, + "grad_norm": 0.4003780484199524, + "learning_rate": 5.394562515678374e-06, + "loss": 0.0023, + "step": 1658 + }, + { + "epoch": 1.4717357570383507, + "grad_norm": 0.04636048153042793, + "learning_rate": 5.389789711728832e-06, + "loss": 0.0002, + "step": 1659 + }, + { + "epoch": 1.4726224783861672, + "grad_norm": 0.23046471178531647, + "learning_rate": 5.385016550409676e-06, + "loss": 0.0011, + "step": 1660 + }, + { + "epoch": 1.4735091997339835, + "grad_norm": 0.5971014499664307, + "learning_rate": 5.380243036097067e-06, + "loss": 0.002, + "step": 1661 + }, + { + "epoch": 1.4743959210818, + "grad_norm": 0.6471933126449585, + "learning_rate": 5.375469173167491e-06, + "loss": 0.005, + "step": 1662 + }, + { + "epoch": 1.4752826424296166, + "grad_norm": 0.03686193749308586, + "learning_rate": 5.370694965997753e-06, + "loss": 0.0002, + "step": 1663 + }, + { + "epoch": 1.476169363777433, + "grad_norm": 0.24532069265842438, + "learning_rate": 5.365920418964973e-06, + "loss": 0.0019, + "step": 1664 + }, + { + "epoch": 1.4770560851252494, + "grad_norm": 0.8025774359703064, + "learning_rate": 5.361145536446582e-06, + "loss": 0.0054, + "step": 1665 + }, + { + "epoch": 1.4779428064730658, + "grad_norm": 0.2576480209827423, + "learning_rate": 5.356370322820325e-06, + "loss": 0.0014, + "step": 1666 + }, + { + "epoch": 1.4788295278208823, + "grad_norm": 1.0776647329330444, + "learning_rate": 5.351594782464239e-06, + "loss": 0.0042, + "step": 1667 + }, + { + "epoch": 1.4797162491686988, + "grad_norm": 0.6167517304420471, + "learning_rate": 5.34681891975667e-06, + "loss": 0.0015, + "step": 1668 + }, + { + "epoch": 1.4806029705165151, + "grad_norm": 0.6090970635414124, + "learning_rate": 5.342042739076257e-06, + "loss": 0.0022, + "step": 1669 + }, + { + "epoch": 1.4814896918643317, + "grad_norm": 0.044063255190849304, + "learning_rate": 5.337266244801927e-06, + "loss": 0.0002, + "step": 1670 + }, + { + "epoch": 1.482376413212148, + "grad_norm": 0.03505855053663254, + "learning_rate": 5.332489441312901e-06, + "loss": 0.0002, + "step": 1671 + }, + { + "epoch": 1.4832631345599645, + "grad_norm": 0.5114758014678955, + "learning_rate": 5.327712332988678e-06, + "loss": 0.0024, + "step": 1672 + }, + { + "epoch": 1.484149855907781, + "grad_norm": 0.08527572453022003, + "learning_rate": 5.322934924209039e-06, + "loss": 0.0005, + "step": 1673 + }, + { + "epoch": 1.4850365772555973, + "grad_norm": 0.12833604216575623, + "learning_rate": 5.318157219354041e-06, + "loss": 0.0004, + "step": 1674 + }, + { + "epoch": 1.4859232986034139, + "grad_norm": 0.045691631734371185, + "learning_rate": 5.313379222804008e-06, + "loss": 0.0004, + "step": 1675 + }, + { + "epoch": 1.4868100199512302, + "grad_norm": 0.2374916523694992, + "learning_rate": 5.308600938939538e-06, + "loss": 0.0008, + "step": 1676 + }, + { + "epoch": 1.4876967412990467, + "grad_norm": 0.047486886382102966, + "learning_rate": 5.303822372141489e-06, + "loss": 0.0003, + "step": 1677 + }, + { + "epoch": 1.4885834626468633, + "grad_norm": 0.08337344229221344, + "learning_rate": 5.2990435267909764e-06, + "loss": 0.0005, + "step": 1678 + }, + { + "epoch": 1.4894701839946798, + "grad_norm": 0.037117332220077515, + "learning_rate": 5.294264407269376e-06, + "loss": 0.0002, + "step": 1679 + }, + { + "epoch": 1.490356905342496, + "grad_norm": 1.1246777772903442, + "learning_rate": 5.2894850179583095e-06, + "loss": 0.0053, + "step": 1680 + }, + { + "epoch": 1.4912436266903126, + "grad_norm": 0.23861926794052124, + "learning_rate": 5.284705363239651e-06, + "loss": 0.0007, + "step": 1681 + }, + { + "epoch": 1.492130348038129, + "grad_norm": 0.34302806854248047, + "learning_rate": 5.279925447495513e-06, + "loss": 0.0018, + "step": 1682 + }, + { + "epoch": 1.4930170693859455, + "grad_norm": 0.00627083471044898, + "learning_rate": 5.275145275108252e-06, + "loss": 0.0, + "step": 1683 + }, + { + "epoch": 1.493903790733762, + "grad_norm": 0.0319259911775589, + "learning_rate": 5.270364850460456e-06, + "loss": 0.0001, + "step": 1684 + }, + { + "epoch": 1.4947905120815783, + "grad_norm": 0.26929083466529846, + "learning_rate": 5.26558417793495e-06, + "loss": 0.0014, + "step": 1685 + }, + { + "epoch": 1.4956772334293948, + "grad_norm": 0.4781963527202606, + "learning_rate": 5.2608032619147765e-06, + "loss": 0.0027, + "step": 1686 + }, + { + "epoch": 1.4965639547772112, + "grad_norm": 0.6385494470596313, + "learning_rate": 5.2560221067832095e-06, + "loss": 0.0107, + "step": 1687 + }, + { + "epoch": 1.4974506761250277, + "grad_norm": 0.07988189905881882, + "learning_rate": 5.251240716923741e-06, + "loss": 0.0004, + "step": 1688 + }, + { + "epoch": 1.4983373974728442, + "grad_norm": 0.02995728887617588, + "learning_rate": 5.246459096720075e-06, + "loss": 0.0002, + "step": 1689 + }, + { + "epoch": 1.4992241188206605, + "grad_norm": 0.2245960831642151, + "learning_rate": 5.2416772505561275e-06, + "loss": 0.0008, + "step": 1690 + }, + { + "epoch": 1.500110840168477, + "grad_norm": 0.3143225908279419, + "learning_rate": 5.236895182816027e-06, + "loss": 0.0021, + "step": 1691 + }, + { + "epoch": 1.5009975615162934, + "grad_norm": 0.03064090944826603, + "learning_rate": 5.232112897884097e-06, + "loss": 0.0001, + "step": 1692 + }, + { + "epoch": 1.5009975615162934, + "eval_loss": 0.01623193733394146, + "eval_runtime": 61.9901, + "eval_samples_per_second": 3.097, + "eval_steps_per_second": 0.774, + "step": 1692 + }, + { + "epoch": 1.50188428286411, + "grad_norm": 0.5175110101699829, + "learning_rate": 5.2273304001448656e-06, + "loss": 0.0029, + "step": 1693 + }, + { + "epoch": 1.5027710042119264, + "grad_norm": 0.60247802734375, + "learning_rate": 5.222547693983053e-06, + "loss": 0.0043, + "step": 1694 + }, + { + "epoch": 1.503657725559743, + "grad_norm": 0.5946943759918213, + "learning_rate": 5.217764783783574e-06, + "loss": 0.0049, + "step": 1695 + }, + { + "epoch": 1.5045444469075593, + "grad_norm": 0.13281765580177307, + "learning_rate": 5.212981673931528e-06, + "loss": 0.0005, + "step": 1696 + }, + { + "epoch": 1.5054311682553756, + "grad_norm": 0.4556334316730499, + "learning_rate": 5.208198368812197e-06, + "loss": 0.0031, + "step": 1697 + }, + { + "epoch": 1.5063178896031921, + "grad_norm": 0.5918658971786499, + "learning_rate": 5.203414872811042e-06, + "loss": 0.0014, + "step": 1698 + }, + { + "epoch": 1.5072046109510087, + "grad_norm": 0.30193421244621277, + "learning_rate": 5.198631190313702e-06, + "loss": 0.0006, + "step": 1699 + }, + { + "epoch": 1.5080913322988252, + "grad_norm": 0.3599817752838135, + "learning_rate": 5.193847325705983e-06, + "loss": 0.0066, + "step": 1700 + }, + { + "epoch": 1.5089780536466415, + "grad_norm": 0.35227012634277344, + "learning_rate": 5.189063283373861e-06, + "loss": 0.002, + "step": 1701 + }, + { + "epoch": 1.509864774994458, + "grad_norm": 0.35682663321495056, + "learning_rate": 5.184279067703473e-06, + "loss": 0.0015, + "step": 1702 + }, + { + "epoch": 1.5107514963422743, + "grad_norm": 0.03233305737376213, + "learning_rate": 5.179494683081116e-06, + "loss": 0.0002, + "step": 1703 + }, + { + "epoch": 1.5116382176900909, + "grad_norm": 0.15405134856700897, + "learning_rate": 5.17471013389324e-06, + "loss": 0.0011, + "step": 1704 + }, + { + "epoch": 1.5125249390379074, + "grad_norm": 0.4507070481777191, + "learning_rate": 5.16992542452645e-06, + "loss": 0.0033, + "step": 1705 + }, + { + "epoch": 1.513411660385724, + "grad_norm": 0.4773896634578705, + "learning_rate": 5.165140559367496e-06, + "loss": 0.0038, + "step": 1706 + }, + { + "epoch": 1.5142983817335403, + "grad_norm": 0.19090554118156433, + "learning_rate": 5.160355542803265e-06, + "loss": 0.0006, + "step": 1707 + }, + { + "epoch": 1.5151851030813566, + "grad_norm": 0.07626492530107498, + "learning_rate": 5.155570379220792e-06, + "loss": 0.0007, + "step": 1708 + }, + { + "epoch": 1.516071824429173, + "grad_norm": 0.20484142005443573, + "learning_rate": 5.150785073007243e-06, + "loss": 0.0018, + "step": 1709 + }, + { + "epoch": 1.5169585457769896, + "grad_norm": 0.1442720890045166, + "learning_rate": 5.145999628549913e-06, + "loss": 0.0005, + "step": 1710 + }, + { + "epoch": 1.5178452671248062, + "grad_norm": 0.24547992646694183, + "learning_rate": 5.141214050236225e-06, + "loss": 0.0012, + "step": 1711 + }, + { + "epoch": 1.5187319884726225, + "grad_norm": 0.10670553892850876, + "learning_rate": 5.136428342453726e-06, + "loss": 0.0008, + "step": 1712 + }, + { + "epoch": 1.5196187098204388, + "grad_norm": 0.4111662209033966, + "learning_rate": 5.131642509590081e-06, + "loss": 0.0069, + "step": 1713 + }, + { + "epoch": 1.5205054311682553, + "grad_norm": 0.13540194928646088, + "learning_rate": 5.1268565560330685e-06, + "loss": 0.0008, + "step": 1714 + }, + { + "epoch": 1.5213921525160718, + "grad_norm": 0.30105480551719666, + "learning_rate": 5.1220704861705775e-06, + "loss": 0.0019, + "step": 1715 + }, + { + "epoch": 1.5222788738638884, + "grad_norm": 0.11186516284942627, + "learning_rate": 5.117284304390606e-06, + "loss": 0.0006, + "step": 1716 + }, + { + "epoch": 1.5231655952117047, + "grad_norm": 0.44437602162361145, + "learning_rate": 5.112498015081254e-06, + "loss": 0.0028, + "step": 1717 + }, + { + "epoch": 1.5240523165595212, + "grad_norm": 0.1276712268590927, + "learning_rate": 5.107711622630716e-06, + "loss": 0.0008, + "step": 1718 + }, + { + "epoch": 1.5249390379073375, + "grad_norm": 0.9235378503799438, + "learning_rate": 5.102925131427289e-06, + "loss": 0.0036, + "step": 1719 + }, + { + "epoch": 1.525825759255154, + "grad_norm": 0.08117254823446274, + "learning_rate": 5.098138545859354e-06, + "loss": 0.0005, + "step": 1720 + }, + { + "epoch": 1.5267124806029706, + "grad_norm": 0.3608892858028412, + "learning_rate": 5.093351870315379e-06, + "loss": 0.0008, + "step": 1721 + }, + { + "epoch": 1.527599201950787, + "grad_norm": 0.3133014440536499, + "learning_rate": 5.088565109183917e-06, + "loss": 0.0014, + "step": 1722 + }, + { + "epoch": 1.5284859232986034, + "grad_norm": 0.5285730361938477, + "learning_rate": 5.083778266853598e-06, + "loss": 0.0017, + "step": 1723 + }, + { + "epoch": 1.5293726446464198, + "grad_norm": 0.4431508779525757, + "learning_rate": 5.078991347713127e-06, + "loss": 0.0073, + "step": 1724 + }, + { + "epoch": 1.5302593659942363, + "grad_norm": 0.27122896909713745, + "learning_rate": 5.074204356151281e-06, + "loss": 0.0044, + "step": 1725 + }, + { + "epoch": 1.5311460873420528, + "grad_norm": 0.14935316145420074, + "learning_rate": 5.069417296556899e-06, + "loss": 0.0011, + "step": 1726 + }, + { + "epoch": 1.5320328086898694, + "grad_norm": 0.14171618223190308, + "learning_rate": 5.064630173318887e-06, + "loss": 0.0005, + "step": 1727 + }, + { + "epoch": 1.5329195300376857, + "grad_norm": 0.13760340213775635, + "learning_rate": 5.059842990826206e-06, + "loss": 0.0011, + "step": 1728 + }, + { + "epoch": 1.533806251385502, + "grad_norm": 0.35050904750823975, + "learning_rate": 5.055055753467875e-06, + "loss": 0.0028, + "step": 1729 + }, + { + "epoch": 1.5346929727333185, + "grad_norm": 0.4986531138420105, + "learning_rate": 5.050268465632957e-06, + "loss": 0.0044, + "step": 1730 + }, + { + "epoch": 1.535579694081135, + "grad_norm": 0.020403970032930374, + "learning_rate": 5.045481131710568e-06, + "loss": 0.0002, + "step": 1731 + }, + { + "epoch": 1.5364664154289516, + "grad_norm": 0.6366745829582214, + "learning_rate": 5.040693756089865e-06, + "loss": 0.0076, + "step": 1732 + }, + { + "epoch": 1.5373531367767679, + "grad_norm": 0.2623487114906311, + "learning_rate": 5.035906343160038e-06, + "loss": 0.0017, + "step": 1733 + }, + { + "epoch": 1.5382398581245842, + "grad_norm": 0.19334276020526886, + "learning_rate": 5.031118897310318e-06, + "loss": 0.0039, + "step": 1734 + }, + { + "epoch": 1.5391265794724007, + "grad_norm": 0.5103495121002197, + "learning_rate": 5.026331422929962e-06, + "loss": 0.0043, + "step": 1735 + }, + { + "epoch": 1.5400133008202173, + "grad_norm": 0.03877022862434387, + "learning_rate": 5.021543924408254e-06, + "loss": 0.0003, + "step": 1736 + }, + { + "epoch": 1.5409000221680338, + "grad_norm": 0.2172171175479889, + "learning_rate": 5.016756406134501e-06, + "loss": 0.0049, + "step": 1737 + }, + { + "epoch": 1.54178674351585, + "grad_norm": 0.1811998188495636, + "learning_rate": 5.011968872498028e-06, + "loss": 0.0013, + "step": 1738 + }, + { + "epoch": 1.5426734648636666, + "grad_norm": 0.45922988653182983, + "learning_rate": 5.007181327888173e-06, + "loss": 0.0036, + "step": 1739 + }, + { + "epoch": 1.543560186211483, + "grad_norm": 0.04768901318311691, + "learning_rate": 5.002393776694284e-06, + "loss": 0.0003, + "step": 1740 + }, + { + "epoch": 1.5444469075592995, + "grad_norm": 0.011605698615312576, + "learning_rate": 4.997606223305717e-06, + "loss": 0.0001, + "step": 1741 + }, + { + "epoch": 1.545333628907116, + "grad_norm": 0.08056286722421646, + "learning_rate": 4.9928186721118285e-06, + "loss": 0.0006, + "step": 1742 + }, + { + "epoch": 1.5462203502549325, + "grad_norm": 0.05473679304122925, + "learning_rate": 4.988031127501974e-06, + "loss": 0.0003, + "step": 1743 + }, + { + "epoch": 1.5471070716027489, + "grad_norm": 0.11067911982536316, + "learning_rate": 4.983243593865501e-06, + "loss": 0.0007, + "step": 1744 + }, + { + "epoch": 1.5479937929505652, + "grad_norm": 0.19086086750030518, + "learning_rate": 4.978456075591747e-06, + "loss": 0.0027, + "step": 1745 + }, + { + "epoch": 1.5488805142983817, + "grad_norm": 0.3276596963405609, + "learning_rate": 4.97366857707004e-06, + "loss": 0.0067, + "step": 1746 + }, + { + "epoch": 1.5497672356461982, + "grad_norm": 0.16259552538394928, + "learning_rate": 4.968881102689684e-06, + "loss": 0.0009, + "step": 1747 + }, + { + "epoch": 1.5506539569940148, + "grad_norm": 0.14583039283752441, + "learning_rate": 4.964093656839963e-06, + "loss": 0.001, + "step": 1748 + }, + { + "epoch": 1.551540678341831, + "grad_norm": 0.0421629436314106, + "learning_rate": 4.959306243910137e-06, + "loss": 0.0004, + "step": 1749 + }, + { + "epoch": 1.5524273996896474, + "grad_norm": 0.29653817415237427, + "learning_rate": 4.954518868289434e-06, + "loss": 0.0028, + "step": 1750 + }, + { + "epoch": 1.553314121037464, + "grad_norm": 0.13450933992862701, + "learning_rate": 4.949731534367045e-06, + "loss": 0.0008, + "step": 1751 + }, + { + "epoch": 1.5542008423852804, + "grad_norm": 0.20457084476947784, + "learning_rate": 4.944944246532127e-06, + "loss": 0.0028, + "step": 1752 + }, + { + "epoch": 1.555087563733097, + "grad_norm": 0.598363995552063, + "learning_rate": 4.940157009173795e-06, + "loss": 0.0066, + "step": 1753 + }, + { + "epoch": 1.5559742850809133, + "grad_norm": 0.04115546494722366, + "learning_rate": 4.935369826681115e-06, + "loss": 0.0004, + "step": 1754 + }, + { + "epoch": 1.5568610064287298, + "grad_norm": 0.027022339403629303, + "learning_rate": 4.930582703443101e-06, + "loss": 0.0002, + "step": 1755 + }, + { + "epoch": 1.5577477277765461, + "grad_norm": 0.2436944842338562, + "learning_rate": 4.92579564384872e-06, + "loss": 0.0013, + "step": 1756 + }, + { + "epoch": 1.5586344491243627, + "grad_norm": 0.1759563386440277, + "learning_rate": 4.9210086522868736e-06, + "loss": 0.0014, + "step": 1757 + }, + { + "epoch": 1.5595211704721792, + "grad_norm": 0.0894976332783699, + "learning_rate": 4.916221733146405e-06, + "loss": 0.0003, + "step": 1758 + }, + { + "epoch": 1.5604078918199957, + "grad_norm": 0.09848005324602127, + "learning_rate": 4.911434890816084e-06, + "loss": 0.0006, + "step": 1759 + }, + { + "epoch": 1.561294613167812, + "grad_norm": 0.05898863077163696, + "learning_rate": 4.906648129684623e-06, + "loss": 0.0003, + "step": 1760 + }, + { + "epoch": 1.5621813345156284, + "grad_norm": 0.46867799758911133, + "learning_rate": 4.901861454140649e-06, + "loss": 0.007, + "step": 1761 + }, + { + "epoch": 1.5630680558634449, + "grad_norm": 0.30307456851005554, + "learning_rate": 4.897074868572711e-06, + "loss": 0.0012, + "step": 1762 + }, + { + "epoch": 1.5639547772112614, + "grad_norm": 0.2749338746070862, + "learning_rate": 4.892288377369285e-06, + "loss": 0.0015, + "step": 1763 + }, + { + "epoch": 1.564841498559078, + "grad_norm": 0.20320548117160797, + "learning_rate": 4.887501984918748e-06, + "loss": 0.0007, + "step": 1764 + }, + { + "epoch": 1.5657282199068943, + "grad_norm": 0.14280276000499725, + "learning_rate": 4.882715695609396e-06, + "loss": 0.0008, + "step": 1765 + }, + { + "epoch": 1.5666149412547106, + "grad_norm": 0.05326784774661064, + "learning_rate": 4.877929513829424e-06, + "loss": 0.0003, + "step": 1766 + }, + { + "epoch": 1.567501662602527, + "grad_norm": 0.4620476961135864, + "learning_rate": 4.873143443966933e-06, + "loss": 0.0022, + "step": 1767 + }, + { + "epoch": 1.5683883839503436, + "grad_norm": 0.8573052287101746, + "learning_rate": 4.868357490409921e-06, + "loss": 0.0051, + "step": 1768 + }, + { + "epoch": 1.5692751052981602, + "grad_norm": 0.24559076130390167, + "learning_rate": 4.863571657546275e-06, + "loss": 0.0008, + "step": 1769 + }, + { + "epoch": 1.5701618266459765, + "grad_norm": 0.09770537912845612, + "learning_rate": 4.858785949763776e-06, + "loss": 0.0004, + "step": 1770 + }, + { + "epoch": 1.5710485479937928, + "grad_norm": 0.06460510939359665, + "learning_rate": 4.854000371450089e-06, + "loss": 0.0002, + "step": 1771 + }, + { + "epoch": 1.5719352693416093, + "grad_norm": 0.08998130261898041, + "learning_rate": 4.849214926992759e-06, + "loss": 0.0005, + "step": 1772 + }, + { + "epoch": 1.5728219906894259, + "grad_norm": 0.215066060423851, + "learning_rate": 4.8444296207792085e-06, + "loss": 0.0008, + "step": 1773 + }, + { + "epoch": 1.5737087120372424, + "grad_norm": 0.11160604655742645, + "learning_rate": 4.839644457196737e-06, + "loss": 0.0007, + "step": 1774 + }, + { + "epoch": 1.5745954333850587, + "grad_norm": 0.24873779714107513, + "learning_rate": 4.834859440632508e-06, + "loss": 0.0004, + "step": 1775 + }, + { + "epoch": 1.5754821547328752, + "grad_norm": 0.6680960655212402, + "learning_rate": 4.83007457547355e-06, + "loss": 0.0077, + "step": 1776 + }, + { + "epoch": 1.5763688760806915, + "grad_norm": 0.024391671642661095, + "learning_rate": 4.8252898661067605e-06, + "loss": 0.0002, + "step": 1777 + }, + { + "epoch": 1.577255597428508, + "grad_norm": 0.4404471814632416, + "learning_rate": 4.820505316918887e-06, + "loss": 0.0059, + "step": 1778 + }, + { + "epoch": 1.5781423187763246, + "grad_norm": 0.3777647316455841, + "learning_rate": 4.81572093229653e-06, + "loss": 0.0058, + "step": 1779 + }, + { + "epoch": 1.5790290401241411, + "grad_norm": 0.3575120270252228, + "learning_rate": 4.81093671662614e-06, + "loss": 0.0014, + "step": 1780 + }, + { + "epoch": 1.5799157614719574, + "grad_norm": 0.1214371770620346, + "learning_rate": 4.8061526742940186e-06, + "loss": 0.0006, + "step": 1781 + }, + { + "epoch": 1.5808024828197738, + "grad_norm": 0.42392250895500183, + "learning_rate": 4.8013688096863e-06, + "loss": 0.0013, + "step": 1782 + }, + { + "epoch": 1.5816892041675903, + "grad_norm": 0.13642321527004242, + "learning_rate": 4.796585127188958e-06, + "loss": 0.0007, + "step": 1783 + }, + { + "epoch": 1.5825759255154068, + "grad_norm": 0.11742696911096573, + "learning_rate": 4.791801631187804e-06, + "loss": 0.0006, + "step": 1784 + }, + { + "epoch": 1.5834626468632234, + "grad_norm": 0.03347081318497658, + "learning_rate": 4.787018326068474e-06, + "loss": 0.0002, + "step": 1785 + }, + { + "epoch": 1.5843493682110397, + "grad_norm": 0.3243880569934845, + "learning_rate": 4.7822352162164275e-06, + "loss": 0.0026, + "step": 1786 + }, + { + "epoch": 1.585236089558856, + "grad_norm": 0.4610430896282196, + "learning_rate": 4.777452306016947e-06, + "loss": 0.0033, + "step": 1787 + }, + { + "epoch": 1.5861228109066725, + "grad_norm": 0.9628936052322388, + "learning_rate": 4.772669599855136e-06, + "loss": 0.011, + "step": 1788 + }, + { + "epoch": 1.587009532254489, + "grad_norm": 0.1590164303779602, + "learning_rate": 4.767887102115904e-06, + "loss": 0.0006, + "step": 1789 + }, + { + "epoch": 1.5878962536023056, + "grad_norm": 0.07788955420255661, + "learning_rate": 4.763104817183974e-06, + "loss": 0.0004, + "step": 1790 + }, + { + "epoch": 1.5887829749501219, + "grad_norm": 0.05542571097612381, + "learning_rate": 4.758322749443874e-06, + "loss": 0.0004, + "step": 1791 + }, + { + "epoch": 1.5896696962979384, + "grad_norm": 0.33823779225349426, + "learning_rate": 4.753540903279928e-06, + "loss": 0.0019, + "step": 1792 + }, + { + "epoch": 1.5905564176457547, + "grad_norm": 0.07747834920883179, + "learning_rate": 4.748759283076262e-06, + "loss": 0.0005, + "step": 1793 + }, + { + "epoch": 1.5914431389935713, + "grad_norm": 0.17952410876750946, + "learning_rate": 4.743977893216792e-06, + "loss": 0.0017, + "step": 1794 + }, + { + "epoch": 1.5923298603413878, + "grad_norm": 0.015386722050607204, + "learning_rate": 4.739196738085225e-06, + "loss": 0.0001, + "step": 1795 + }, + { + "epoch": 1.5932165816892043, + "grad_norm": 0.1653813123703003, + "learning_rate": 4.734415822065053e-06, + "loss": 0.0008, + "step": 1796 + }, + { + "epoch": 1.5941033030370206, + "grad_norm": 0.055652689188718796, + "learning_rate": 4.729635149539543e-06, + "loss": 0.0004, + "step": 1797 + }, + { + "epoch": 1.594990024384837, + "grad_norm": 0.09155047684907913, + "learning_rate": 4.7248547248917495e-06, + "loss": 0.001, + "step": 1798 + }, + { + "epoch": 1.5958767457326535, + "grad_norm": 0.4243689179420471, + "learning_rate": 4.7200745525044885e-06, + "loss": 0.0072, + "step": 1799 + }, + { + "epoch": 1.59676346708047, + "grad_norm": 0.2508626878261566, + "learning_rate": 4.715294636760352e-06, + "loss": 0.003, + "step": 1800 + }, + { + "epoch": 1.5976501884282865, + "grad_norm": 0.3041090965270996, + "learning_rate": 4.710514982041691e-06, + "loss": 0.0092, + "step": 1801 + }, + { + "epoch": 1.5985369097761029, + "grad_norm": 0.45713216066360474, + "learning_rate": 4.705735592730626e-06, + "loss": 0.0018, + "step": 1802 + }, + { + "epoch": 1.5994236311239192, + "grad_norm": 0.10100582242012024, + "learning_rate": 4.700956473209025e-06, + "loss": 0.001, + "step": 1803 + }, + { + "epoch": 1.6003103524717357, + "grad_norm": 0.0519060418009758, + "learning_rate": 4.696177627858511e-06, + "loss": 0.0003, + "step": 1804 + }, + { + "epoch": 1.6011970738195522, + "grad_norm": 1.5515557527542114, + "learning_rate": 4.6913990610604625e-06, + "loss": 0.0047, + "step": 1805 + }, + { + "epoch": 1.6020837951673688, + "grad_norm": 0.26683878898620605, + "learning_rate": 4.686620777195993e-06, + "loss": 0.0023, + "step": 1806 + }, + { + "epoch": 1.602970516515185, + "grad_norm": 0.23357915878295898, + "learning_rate": 4.681842780645961e-06, + "loss": 0.0015, + "step": 1807 + }, + { + "epoch": 1.6038572378630016, + "grad_norm": 0.40496546030044556, + "learning_rate": 4.677065075790961e-06, + "loss": 0.0038, + "step": 1808 + }, + { + "epoch": 1.604743959210818, + "grad_norm": 0.13487201929092407, + "learning_rate": 4.672287667011323e-06, + "loss": 0.0006, + "step": 1809 + }, + { + "epoch": 1.6056306805586344, + "grad_norm": 0.2809060513973236, + "learning_rate": 4.667510558687101e-06, + "loss": 0.0019, + "step": 1810 + }, + { + "epoch": 1.606517401906451, + "grad_norm": 0.08889652043581009, + "learning_rate": 4.662733755198073e-06, + "loss": 0.0007, + "step": 1811 + }, + { + "epoch": 1.6074041232542673, + "grad_norm": 0.25819990038871765, + "learning_rate": 4.657957260923745e-06, + "loss": 0.0037, + "step": 1812 + }, + { + "epoch": 1.6082908446020838, + "grad_norm": 0.09379664063453674, + "learning_rate": 4.653181080243332e-06, + "loss": 0.0009, + "step": 1813 + }, + { + "epoch": 1.6091775659499001, + "grad_norm": 0.06153678521513939, + "learning_rate": 4.648405217535763e-06, + "loss": 0.0005, + "step": 1814 + }, + { + "epoch": 1.6100642872977167, + "grad_norm": 0.1556420475244522, + "learning_rate": 4.643629677179677e-06, + "loss": 0.0022, + "step": 1815 + }, + { + "epoch": 1.6109510086455332, + "grad_norm": 0.29694435000419617, + "learning_rate": 4.6388544635534186e-06, + "loss": 0.002, + "step": 1816 + }, + { + "epoch": 1.6118377299933497, + "grad_norm": 0.09254162758588791, + "learning_rate": 4.634079581035029e-06, + "loss": 0.0008, + "step": 1817 + }, + { + "epoch": 1.612724451341166, + "grad_norm": 0.039080262184143066, + "learning_rate": 4.629305034002249e-06, + "loss": 0.0004, + "step": 1818 + }, + { + "epoch": 1.6136111726889824, + "grad_norm": 0.334262490272522, + "learning_rate": 4.62453082683251e-06, + "loss": 0.0021, + "step": 1819 + }, + { + "epoch": 1.6144978940367989, + "grad_norm": 0.07444017380475998, + "learning_rate": 4.619756963902935e-06, + "loss": 0.0004, + "step": 1820 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.4941510558128357, + "learning_rate": 4.614983449590326e-06, + "loss": 0.0143, + "step": 1821 + }, + { + "epoch": 1.616271336732432, + "grad_norm": 0.11141020804643631, + "learning_rate": 4.610210288271169e-06, + "loss": 0.0007, + "step": 1822 + }, + { + "epoch": 1.6171580580802483, + "grad_norm": 0.05057799816131592, + "learning_rate": 4.605437484321627e-06, + "loss": 0.0004, + "step": 1823 + }, + { + "epoch": 1.6180447794280646, + "grad_norm": 0.5352963209152222, + "learning_rate": 4.600665042117534e-06, + "loss": 0.0037, + "step": 1824 + }, + { + "epoch": 1.618931500775881, + "grad_norm": 0.4706566035747528, + "learning_rate": 4.59589296603439e-06, + "loss": 0.0059, + "step": 1825 + }, + { + "epoch": 1.6198182221236976, + "grad_norm": 0.09059499949216843, + "learning_rate": 4.591121260447365e-06, + "loss": 0.0006, + "step": 1826 + }, + { + "epoch": 1.6207049434715142, + "grad_norm": 0.13940253853797913, + "learning_rate": 4.586349929731285e-06, + "loss": 0.0014, + "step": 1827 + }, + { + "epoch": 1.6215916648193305, + "grad_norm": 0.11622869223356247, + "learning_rate": 4.581578978260634e-06, + "loss": 0.001, + "step": 1828 + }, + { + "epoch": 1.622478386167147, + "grad_norm": 0.20990999042987823, + "learning_rate": 4.5768084104095436e-06, + "loss": 0.0013, + "step": 1829 + }, + { + "epoch": 1.6233651075149633, + "grad_norm": 0.586199939250946, + "learning_rate": 4.5720382305518026e-06, + "loss": 0.004, + "step": 1830 + }, + { + "epoch": 1.6242518288627799, + "grad_norm": 0.2799239456653595, + "learning_rate": 4.5672684430608384e-06, + "loss": 0.0067, + "step": 1831 + }, + { + "epoch": 1.6251385502105964, + "grad_norm": 0.43799954652786255, + "learning_rate": 4.562499052309715e-06, + "loss": 0.0032, + "step": 1832 + }, + { + "epoch": 1.626025271558413, + "grad_norm": 0.02102653495967388, + "learning_rate": 4.5577300626711444e-06, + "loss": 0.0001, + "step": 1833 + }, + { + "epoch": 1.6269119929062292, + "grad_norm": 0.051001351326704025, + "learning_rate": 4.5529614785174606e-06, + "loss": 0.0004, + "step": 1834 + }, + { + "epoch": 1.6277987142540455, + "grad_norm": 0.23884452879428864, + "learning_rate": 4.548193304220627e-06, + "loss": 0.0014, + "step": 1835 + }, + { + "epoch": 1.628685435601862, + "grad_norm": 0.5026673674583435, + "learning_rate": 4.543425544152232e-06, + "loss": 0.0052, + "step": 1836 + }, + { + "epoch": 1.6295721569496786, + "grad_norm": 0.10448399931192398, + "learning_rate": 4.53865820268349e-06, + "loss": 0.0005, + "step": 1837 + }, + { + "epoch": 1.6304588782974951, + "grad_norm": 0.5924953818321228, + "learning_rate": 4.533891284185226e-06, + "loss": 0.0023, + "step": 1838 + }, + { + "epoch": 1.6313455996453115, + "grad_norm": 0.035826656967401505, + "learning_rate": 4.529124793027873e-06, + "loss": 0.0003, + "step": 1839 + }, + { + "epoch": 1.6322323209931278, + "grad_norm": 0.30904772877693176, + "learning_rate": 4.524358733581485e-06, + "loss": 0.0036, + "step": 1840 + }, + { + "epoch": 1.6331190423409443, + "grad_norm": 0.2722933888435364, + "learning_rate": 4.519593110215708e-06, + "loss": 0.0018, + "step": 1841 + }, + { + "epoch": 1.6340057636887608, + "grad_norm": 0.5684006214141846, + "learning_rate": 4.514827927299792e-06, + "loss": 0.0101, + "step": 1842 + }, + { + "epoch": 1.6348924850365774, + "grad_norm": 0.6988949775695801, + "learning_rate": 4.510063189202585e-06, + "loss": 0.0019, + "step": 1843 + }, + { + "epoch": 1.6357792063843937, + "grad_norm": 0.1531183421611786, + "learning_rate": 4.505298900292528e-06, + "loss": 0.001, + "step": 1844 + }, + { + "epoch": 1.6366659277322102, + "grad_norm": 0.4703754484653473, + "learning_rate": 4.5005350649376465e-06, + "loss": 0.0049, + "step": 1845 + }, + { + "epoch": 1.6375526490800265, + "grad_norm": 0.3406502902507782, + "learning_rate": 4.4957716875055505e-06, + "loss": 0.0036, + "step": 1846 + }, + { + "epoch": 1.638439370427843, + "grad_norm": 0.5264698266983032, + "learning_rate": 4.491008772363435e-06, + "loss": 0.0012, + "step": 1847 + }, + { + "epoch": 1.6393260917756596, + "grad_norm": 0.13831202685832977, + "learning_rate": 4.486246323878065e-06, + "loss": 0.0016, + "step": 1848 + }, + { + "epoch": 1.640212813123476, + "grad_norm": 0.06777837127447128, + "learning_rate": 4.481484346415783e-06, + "loss": 0.0006, + "step": 1849 + }, + { + "epoch": 1.6410995344712924, + "grad_norm": 0.13357211649417877, + "learning_rate": 4.476722844342493e-06, + "loss": 0.0006, + "step": 1850 + }, + { + "epoch": 1.6419862558191087, + "grad_norm": 0.3240200877189636, + "learning_rate": 4.4719618220236715e-06, + "loss": 0.0051, + "step": 1851 + }, + { + "epoch": 1.6428729771669253, + "grad_norm": 0.08127690851688385, + "learning_rate": 4.467201283824349e-06, + "loss": 0.0006, + "step": 1852 + }, + { + "epoch": 1.6437596985147418, + "grad_norm": 0.12601187825202942, + "learning_rate": 4.462441234109115e-06, + "loss": 0.0008, + "step": 1853 + }, + { + "epoch": 1.6446464198625583, + "grad_norm": 0.19904936850070953, + "learning_rate": 4.457681677242107e-06, + "loss": 0.0011, + "step": 1854 + }, + { + "epoch": 1.6455331412103746, + "grad_norm": 0.5217944383621216, + "learning_rate": 4.452922617587017e-06, + "loss": 0.0056, + "step": 1855 + }, + { + "epoch": 1.646419862558191, + "grad_norm": 0.21843411028385162, + "learning_rate": 4.448164059507078e-06, + "loss": 0.0014, + "step": 1856 + }, + { + "epoch": 1.6473065839060075, + "grad_norm": 0.1688118726015091, + "learning_rate": 4.44340600736506e-06, + "loss": 0.002, + "step": 1857 + }, + { + "epoch": 1.648193305253824, + "grad_norm": 0.12506018579006195, + "learning_rate": 4.438648465523277e-06, + "loss": 0.0014, + "step": 1858 + }, + { + "epoch": 1.6490800266016405, + "grad_norm": 0.139232337474823, + "learning_rate": 4.433891438343566e-06, + "loss": 0.0009, + "step": 1859 + }, + { + "epoch": 1.6499667479494569, + "grad_norm": 0.2715148627758026, + "learning_rate": 4.4291349301873e-06, + "loss": 0.0021, + "step": 1860 + }, + { + "epoch": 1.6508534692972732, + "grad_norm": 0.3670569360256195, + "learning_rate": 4.424378945415368e-06, + "loss": 0.0041, + "step": 1861 + }, + { + "epoch": 1.6517401906450897, + "grad_norm": 0.06340687721967697, + "learning_rate": 4.419623488388189e-06, + "loss": 0.0005, + "step": 1862 + }, + { + "epoch": 1.6526269119929062, + "grad_norm": 0.22658933699131012, + "learning_rate": 4.4148685634656905e-06, + "loss": 0.0017, + "step": 1863 + }, + { + "epoch": 1.6535136333407228, + "grad_norm": 0.33755749464035034, + "learning_rate": 4.410114175007313e-06, + "loss": 0.0033, + "step": 1864 + }, + { + "epoch": 1.654400354688539, + "grad_norm": 0.31697922945022583, + "learning_rate": 4.405360327372011e-06, + "loss": 0.0015, + "step": 1865 + }, + { + "epoch": 1.6552870760363556, + "grad_norm": 0.13214468955993652, + "learning_rate": 4.4006070249182345e-06, + "loss": 0.0007, + "step": 1866 + }, + { + "epoch": 1.656173797384172, + "grad_norm": 0.15995311737060547, + "learning_rate": 4.395854272003942e-06, + "loss": 0.0007, + "step": 1867 + }, + { + "epoch": 1.6570605187319885, + "grad_norm": 0.028402239084243774, + "learning_rate": 4.391102072986581e-06, + "loss": 0.0001, + "step": 1868 + }, + { + "epoch": 1.657947240079805, + "grad_norm": 0.054573215544223785, + "learning_rate": 4.386350432223096e-06, + "loss": 0.0004, + "step": 1869 + }, + { + "epoch": 1.6588339614276215, + "grad_norm": 0.1543371081352234, + "learning_rate": 4.381599354069919e-06, + "loss": 0.0009, + "step": 1870 + }, + { + "epoch": 1.6597206827754378, + "grad_norm": 0.21532535552978516, + "learning_rate": 4.3768488428829625e-06, + "loss": 0.0011, + "step": 1871 + }, + { + "epoch": 1.6606074041232541, + "grad_norm": 0.21318702399730682, + "learning_rate": 4.372098903017624e-06, + "loss": 0.0006, + "step": 1872 + }, + { + "epoch": 1.6614941254710707, + "grad_norm": 0.18713395297527313, + "learning_rate": 4.3673495388287764e-06, + "loss": 0.0011, + "step": 1873 + }, + { + "epoch": 1.6623808468188872, + "grad_norm": 0.3029296100139618, + "learning_rate": 4.3626007546707615e-06, + "loss": 0.0017, + "step": 1874 + }, + { + "epoch": 1.6632675681667037, + "grad_norm": 0.1091129258275032, + "learning_rate": 4.357852554897391e-06, + "loss": 0.0006, + "step": 1875 + }, + { + "epoch": 1.66415428951452, + "grad_norm": 0.12485738098621368, + "learning_rate": 4.353104943861943e-06, + "loss": 0.0008, + "step": 1876 + }, + { + "epoch": 1.6650410108623364, + "grad_norm": 0.3830507695674896, + "learning_rate": 4.348357925917153e-06, + "loss": 0.0022, + "step": 1877 + }, + { + "epoch": 1.665927732210153, + "grad_norm": 0.09489868581295013, + "learning_rate": 4.343611505415213e-06, + "loss": 0.0004, + "step": 1878 + }, + { + "epoch": 1.6668144535579694, + "grad_norm": 0.44516703486442566, + "learning_rate": 4.338865686707771e-06, + "loss": 0.0008, + "step": 1879 + }, + { + "epoch": 1.667701174905786, + "grad_norm": 0.6598436832427979, + "learning_rate": 4.334120474145918e-06, + "loss": 0.0062, + "step": 1880 + }, + { + "epoch": 1.6685878962536023, + "grad_norm": 0.015342209488153458, + "learning_rate": 4.329375872080191e-06, + "loss": 0.0001, + "step": 1881 + }, + { + "epoch": 1.6694746176014188, + "grad_norm": 0.002708827145397663, + "learning_rate": 4.324631884860569e-06, + "loss": 0.0, + "step": 1882 + }, + { + "epoch": 1.670361338949235, + "grad_norm": 0.09634935110807419, + "learning_rate": 4.3198885168364656e-06, + "loss": 0.0005, + "step": 1883 + }, + { + "epoch": 1.6712480602970516, + "grad_norm": 0.09328538179397583, + "learning_rate": 4.315145772356729e-06, + "loss": 0.0003, + "step": 1884 + }, + { + "epoch": 1.6721347816448682, + "grad_norm": 0.05238676443696022, + "learning_rate": 4.310403655769629e-06, + "loss": 0.0002, + "step": 1885 + }, + { + "epoch": 1.6730215029926847, + "grad_norm": 0.3598534166812897, + "learning_rate": 4.305662171422872e-06, + "loss": 0.0042, + "step": 1886 + }, + { + "epoch": 1.673908224340501, + "grad_norm": 0.11761850118637085, + "learning_rate": 4.3009213236635725e-06, + "loss": 0.0005, + "step": 1887 + }, + { + "epoch": 1.6747949456883173, + "grad_norm": 0.04603422060608864, + "learning_rate": 4.296181116838268e-06, + "loss": 0.0002, + "step": 1888 + }, + { + "epoch": 1.6756816670361339, + "grad_norm": 0.0496135875582695, + "learning_rate": 4.291441555292904e-06, + "loss": 0.0003, + "step": 1889 + }, + { + "epoch": 1.6765683883839504, + "grad_norm": 0.24192200601100922, + "learning_rate": 4.2867026433728405e-06, + "loss": 0.0006, + "step": 1890 + }, + { + "epoch": 1.677455109731767, + "grad_norm": 0.7403578162193298, + "learning_rate": 4.281964385422837e-06, + "loss": 0.0053, + "step": 1891 + }, + { + "epoch": 1.6783418310795832, + "grad_norm": 0.05213829502463341, + "learning_rate": 4.277226785787054e-06, + "loss": 0.0002, + "step": 1892 + }, + { + "epoch": 1.6792285524273995, + "grad_norm": 0.18640650808811188, + "learning_rate": 4.272489848809053e-06, + "loss": 0.0009, + "step": 1893 + }, + { + "epoch": 1.680115273775216, + "grad_norm": 0.07999394834041595, + "learning_rate": 4.2677535788317796e-06, + "loss": 0.0003, + "step": 1894 + }, + { + "epoch": 1.6810019951230326, + "grad_norm": 0.04001365602016449, + "learning_rate": 4.263017980197575e-06, + "loss": 0.0002, + "step": 1895 + }, + { + "epoch": 1.6818887164708491, + "grad_norm": 0.2568782567977905, + "learning_rate": 4.258283057248161e-06, + "loss": 0.0016, + "step": 1896 + }, + { + "epoch": 1.6827754378186655, + "grad_norm": 0.323002427816391, + "learning_rate": 4.253548814324644e-06, + "loss": 0.0023, + "step": 1897 + }, + { + "epoch": 1.683662159166482, + "grad_norm": 0.30060967803001404, + "learning_rate": 4.248815255767503e-06, + "loss": 0.0016, + "step": 1898 + }, + { + "epoch": 1.6845488805142983, + "grad_norm": 0.3999418616294861, + "learning_rate": 4.2440823859165884e-06, + "loss": 0.0017, + "step": 1899 + }, + { + "epoch": 1.6854356018621148, + "grad_norm": 0.049183495342731476, + "learning_rate": 4.239350209111125e-06, + "loss": 0.0001, + "step": 1900 + }, + { + "epoch": 1.6863223232099314, + "grad_norm": 0.33251315355300903, + "learning_rate": 4.234618729689699e-06, + "loss": 0.0067, + "step": 1901 + }, + { + "epoch": 1.6872090445577477, + "grad_norm": 0.3990643620491028, + "learning_rate": 4.229887951990255e-06, + "loss": 0.0063, + "step": 1902 + }, + { + "epoch": 1.6880957659055642, + "grad_norm": 0.07162373512983322, + "learning_rate": 4.225157880350095e-06, + "loss": 0.0002, + "step": 1903 + }, + { + "epoch": 1.6889824872533805, + "grad_norm": 0.231755331158638, + "learning_rate": 4.22042851910588e-06, + "loss": 0.0026, + "step": 1904 + }, + { + "epoch": 1.689869208601197, + "grad_norm": 0.12471719831228256, + "learning_rate": 4.2156998725936105e-06, + "loss": 0.0005, + "step": 1905 + }, + { + "epoch": 1.6907559299490136, + "grad_norm": 0.032764095813035965, + "learning_rate": 4.210971945148635e-06, + "loss": 0.0002, + "step": 1906 + }, + { + "epoch": 1.6916426512968301, + "grad_norm": 0.1930425763130188, + "learning_rate": 4.206244741105649e-06, + "loss": 0.0043, + "step": 1907 + }, + { + "epoch": 1.6925293726446464, + "grad_norm": 0.24028733372688293, + "learning_rate": 4.201518264798674e-06, + "loss": 0.0008, + "step": 1908 + }, + { + "epoch": 1.6934160939924627, + "grad_norm": 0.12100367993116379, + "learning_rate": 4.196792520561072e-06, + "loss": 0.0007, + "step": 1909 + }, + { + "epoch": 1.6943028153402793, + "grad_norm": 0.05092811957001686, + "learning_rate": 4.19206751272553e-06, + "loss": 0.0002, + "step": 1910 + }, + { + "epoch": 1.6951895366880958, + "grad_norm": 0.06047261506319046, + "learning_rate": 4.187343245624065e-06, + "loss": 0.0003, + "step": 1911 + }, + { + "epoch": 1.6960762580359123, + "grad_norm": 0.005897665396332741, + "learning_rate": 4.182619723588007e-06, + "loss": 0.0, + "step": 1912 + }, + { + "epoch": 1.6969629793837286, + "grad_norm": 0.21145783364772797, + "learning_rate": 4.177896950948008e-06, + "loss": 0.0014, + "step": 1913 + }, + { + "epoch": 1.697849700731545, + "grad_norm": 0.0033418999519199133, + "learning_rate": 4.173174932034035e-06, + "loss": 0.0, + "step": 1914 + }, + { + "epoch": 1.6987364220793615, + "grad_norm": 0.2090112268924713, + "learning_rate": 4.168453671175358e-06, + "loss": 0.0008, + "step": 1915 + }, + { + "epoch": 1.699623143427178, + "grad_norm": 0.042710330337285995, + "learning_rate": 4.163733172700555e-06, + "loss": 0.0003, + "step": 1916 + }, + { + "epoch": 1.7005098647749946, + "grad_norm": 0.2604596018791199, + "learning_rate": 4.159013440937506e-06, + "loss": 0.0009, + "step": 1917 + }, + { + "epoch": 1.7013965861228109, + "grad_norm": 0.07750516384840012, + "learning_rate": 4.1542944802133876e-06, + "loss": 0.0002, + "step": 1918 + }, + { + "epoch": 1.7022833074706274, + "grad_norm": 0.06373310834169388, + "learning_rate": 4.149576294854668e-06, + "loss": 0.0002, + "step": 1919 + }, + { + "epoch": 1.7031700288184437, + "grad_norm": 0.5672020316123962, + "learning_rate": 4.1448588891871055e-06, + "loss": 0.0034, + "step": 1920 + }, + { + "epoch": 1.7040567501662602, + "grad_norm": 0.11927378922700882, + "learning_rate": 4.140142267535744e-06, + "loss": 0.0006, + "step": 1921 + }, + { + "epoch": 1.7049434715140768, + "grad_norm": 0.3349062204360962, + "learning_rate": 4.135426434224909e-06, + "loss": 0.0025, + "step": 1922 + }, + { + "epoch": 1.7058301928618933, + "grad_norm": 0.38301882147789, + "learning_rate": 4.130711393578201e-06, + "loss": 0.0005, + "step": 1923 + }, + { + "epoch": 1.7067169142097096, + "grad_norm": 0.16485494375228882, + "learning_rate": 4.125997149918494e-06, + "loss": 0.0011, + "step": 1924 + }, + { + "epoch": 1.707603635557526, + "grad_norm": 0.13596734404563904, + "learning_rate": 4.1212837075679366e-06, + "loss": 0.001, + "step": 1925 + }, + { + "epoch": 1.7084903569053425, + "grad_norm": 0.08278538286685944, + "learning_rate": 4.116571070847935e-06, + "loss": 0.0003, + "step": 1926 + }, + { + "epoch": 1.709377078253159, + "grad_norm": 0.34995728731155396, + "learning_rate": 4.111859244079159e-06, + "loss": 0.0035, + "step": 1927 + }, + { + "epoch": 1.7102637996009755, + "grad_norm": 0.025332273915410042, + "learning_rate": 4.107148231581543e-06, + "loss": 0.0001, + "step": 1928 + }, + { + "epoch": 1.7111505209487918, + "grad_norm": 0.05006008595228195, + "learning_rate": 4.102438037674265e-06, + "loss": 0.0002, + "step": 1929 + }, + { + "epoch": 1.7120372422966081, + "grad_norm": 0.05314438045024872, + "learning_rate": 4.097728666675757e-06, + "loss": 0.0002, + "step": 1930 + }, + { + "epoch": 1.7129239636444247, + "grad_norm": 0.5813071727752686, + "learning_rate": 4.093020122903694e-06, + "loss": 0.0022, + "step": 1931 + }, + { + "epoch": 1.7138106849922412, + "grad_norm": 0.2786867916584015, + "learning_rate": 4.088312410674999e-06, + "loss": 0.001, + "step": 1932 + }, + { + "epoch": 1.7146974063400577, + "grad_norm": 0.26664793491363525, + "learning_rate": 4.0836055343058265e-06, + "loss": 0.0006, + "step": 1933 + }, + { + "epoch": 1.715584127687874, + "grad_norm": 0.0041130343452095985, + "learning_rate": 4.078899498111563e-06, + "loss": 0.0, + "step": 1934 + }, + { + "epoch": 1.7164708490356906, + "grad_norm": 0.18312929570674896, + "learning_rate": 4.074194306406834e-06, + "loss": 0.0006, + "step": 1935 + }, + { + "epoch": 1.717357570383507, + "grad_norm": 0.41918063163757324, + "learning_rate": 4.069489963505482e-06, + "loss": 0.004, + "step": 1936 + }, + { + "epoch": 1.7182442917313234, + "grad_norm": 0.30160436034202576, + "learning_rate": 4.0647864737205735e-06, + "loss": 0.0044, + "step": 1937 + }, + { + "epoch": 1.71913101307914, + "grad_norm": 0.7079899907112122, + "learning_rate": 4.060083841364392e-06, + "loss": 0.0154, + "step": 1938 + }, + { + "epoch": 1.7200177344269565, + "grad_norm": 0.5802380442619324, + "learning_rate": 4.055382070748441e-06, + "loss": 0.006, + "step": 1939 + }, + { + "epoch": 1.7209044557747728, + "grad_norm": 0.022900115698575974, + "learning_rate": 4.050681166183427e-06, + "loss": 0.0001, + "step": 1940 + }, + { + "epoch": 1.7217911771225891, + "grad_norm": 0.04548019915819168, + "learning_rate": 4.045981131979263e-06, + "loss": 0.0003, + "step": 1941 + }, + { + "epoch": 1.7226778984704056, + "grad_norm": 0.09870709478855133, + "learning_rate": 4.041281972445069e-06, + "loss": 0.0007, + "step": 1942 + }, + { + "epoch": 1.7235646198182222, + "grad_norm": 0.07183930277824402, + "learning_rate": 4.036583691889159e-06, + "loss": 0.0004, + "step": 1943 + }, + { + "epoch": 1.7244513411660387, + "grad_norm": 0.055653832852840424, + "learning_rate": 4.0318862946190396e-06, + "loss": 0.0002, + "step": 1944 + }, + { + "epoch": 1.725338062513855, + "grad_norm": 0.019786400720477104, + "learning_rate": 4.027189784941412e-06, + "loss": 0.0001, + "step": 1945 + }, + { + "epoch": 1.7262247838616713, + "grad_norm": 0.08150695264339447, + "learning_rate": 4.0224941671621625e-06, + "loss": 0.0004, + "step": 1946 + }, + { + "epoch": 1.7271115052094879, + "grad_norm": 0.4023008346557617, + "learning_rate": 4.017799445586358e-06, + "loss": 0.0025, + "step": 1947 + }, + { + "epoch": 1.7279982265573044, + "grad_norm": 0.04819345846772194, + "learning_rate": 4.013105624518244e-06, + "loss": 0.0003, + "step": 1948 + }, + { + "epoch": 1.728884947905121, + "grad_norm": 0.1354270726442337, + "learning_rate": 4.0084127082612414e-06, + "loss": 0.0005, + "step": 1949 + }, + { + "epoch": 1.7297716692529372, + "grad_norm": 0.3102273941040039, + "learning_rate": 4.003720701117943e-06, + "loss": 0.0016, + "step": 1950 + }, + { + "epoch": 1.7306583906007535, + "grad_norm": 0.07029831409454346, + "learning_rate": 3.999029607390103e-06, + "loss": 0.0003, + "step": 1951 + }, + { + "epoch": 1.73154511194857, + "grad_norm": 0.06340515613555908, + "learning_rate": 3.994339431378642e-06, + "loss": 0.0001, + "step": 1952 + }, + { + "epoch": 1.7324318332963866, + "grad_norm": 0.026167303323745728, + "learning_rate": 3.989650177383641e-06, + "loss": 0.0002, + "step": 1953 + }, + { + "epoch": 1.7333185546442031, + "grad_norm": 0.019675275310873985, + "learning_rate": 3.9849618497043316e-06, + "loss": 0.0001, + "step": 1954 + }, + { + "epoch": 1.7342052759920195, + "grad_norm": 0.7322620153427124, + "learning_rate": 3.980274452639097e-06, + "loss": 0.0028, + "step": 1955 + }, + { + "epoch": 1.735091997339836, + "grad_norm": 0.01168559119105339, + "learning_rate": 3.975587990485468e-06, + "loss": 0.0001, + "step": 1956 + }, + { + "epoch": 1.7359787186876523, + "grad_norm": 0.06696490198373795, + "learning_rate": 3.97090246754012e-06, + "loss": 0.0005, + "step": 1957 + }, + { + "epoch": 1.7368654400354688, + "grad_norm": 0.3530293107032776, + "learning_rate": 3.966217888098866e-06, + "loss": 0.0013, + "step": 1958 + }, + { + "epoch": 1.7377521613832854, + "grad_norm": 0.5210719108581543, + "learning_rate": 3.961534256456648e-06, + "loss": 0.0061, + "step": 1959 + }, + { + "epoch": 1.738638882731102, + "grad_norm": 0.28391262888908386, + "learning_rate": 3.9568515769075525e-06, + "loss": 0.0019, + "step": 1960 + }, + { + "epoch": 1.7395256040789182, + "grad_norm": 0.2902699112892151, + "learning_rate": 3.95216985374478e-06, + "loss": 0.002, + "step": 1961 + }, + { + "epoch": 1.7404123254267345, + "grad_norm": 0.1008397787809372, + "learning_rate": 3.94748909126066e-06, + "loss": 0.0009, + "step": 1962 + }, + { + "epoch": 1.741299046774551, + "grad_norm": 0.1551593691110611, + "learning_rate": 3.94280929374664e-06, + "loss": 0.0004, + "step": 1963 + }, + { + "epoch": 1.7421857681223676, + "grad_norm": 0.3223740756511688, + "learning_rate": 3.938130465493286e-06, + "loss": 0.0028, + "step": 1964 + }, + { + "epoch": 1.7430724894701841, + "grad_norm": 0.03544246777892113, + "learning_rate": 3.933452610790271e-06, + "loss": 0.0003, + "step": 1965 + }, + { + "epoch": 1.7439592108180004, + "grad_norm": 0.04058455675840378, + "learning_rate": 3.928775733926376e-06, + "loss": 0.0002, + "step": 1966 + }, + { + "epoch": 1.7448459321658167, + "grad_norm": 0.18618978559970856, + "learning_rate": 3.924099839189488e-06, + "loss": 0.001, + "step": 1967 + }, + { + "epoch": 1.7457326535136333, + "grad_norm": 0.4316830039024353, + "learning_rate": 3.9194249308665915e-06, + "loss": 0.0041, + "step": 1968 + }, + { + "epoch": 1.7466193748614498, + "grad_norm": 0.32625532150268555, + "learning_rate": 3.914751013243769e-06, + "loss": 0.0018, + "step": 1969 + }, + { + "epoch": 1.7475060962092663, + "grad_norm": 0.18915483355522156, + "learning_rate": 3.91007809060619e-06, + "loss": 0.0011, + "step": 1970 + }, + { + "epoch": 1.7483928175570826, + "grad_norm": 0.44998809695243835, + "learning_rate": 3.905406167238116e-06, + "loss": 0.0061, + "step": 1971 + }, + { + "epoch": 1.7492795389048992, + "grad_norm": 0.22313672304153442, + "learning_rate": 3.9007352474228925e-06, + "loss": 0.0019, + "step": 1972 + }, + { + "epoch": 1.7501662602527155, + "grad_norm": 0.018498016521334648, + "learning_rate": 3.896065335442941e-06, + "loss": 0.0001, + "step": 1973 + }, + { + "epoch": 1.751052981600532, + "grad_norm": 0.4142586886882782, + "learning_rate": 3.891396435579763e-06, + "loss": 0.0029, + "step": 1974 + }, + { + "epoch": 1.7519397029483486, + "grad_norm": 0.2485865354537964, + "learning_rate": 3.886728552113931e-06, + "loss": 0.0008, + "step": 1975 + }, + { + "epoch": 1.752826424296165, + "grad_norm": 0.2031048834323883, + "learning_rate": 3.882061689325084e-06, + "loss": 0.0007, + "step": 1976 + }, + { + "epoch": 1.7537131456439814, + "grad_norm": 0.23484554886817932, + "learning_rate": 3.877395851491926e-06, + "loss": 0.0009, + "step": 1977 + }, + { + "epoch": 1.7545998669917977, + "grad_norm": 0.13688446581363678, + "learning_rate": 3.872731042892225e-06, + "loss": 0.0007, + "step": 1978 + }, + { + "epoch": 1.7554865883396142, + "grad_norm": 0.15276920795440674, + "learning_rate": 3.868067267802798e-06, + "loss": 0.001, + "step": 1979 + }, + { + "epoch": 1.7563733096874308, + "grad_norm": 0.4296773672103882, + "learning_rate": 3.8634045304995185e-06, + "loss": 0.0086, + "step": 1980 + }, + { + "epoch": 1.7572600310352473, + "grad_norm": 0.19526557624340057, + "learning_rate": 3.858742835257313e-06, + "loss": 0.0011, + "step": 1981 + }, + { + "epoch": 1.7581467523830636, + "grad_norm": 0.5239609479904175, + "learning_rate": 3.854082186350145e-06, + "loss": 0.0028, + "step": 1982 + }, + { + "epoch": 1.75903347373088, + "grad_norm": 0.13047945499420166, + "learning_rate": 3.849422588051022e-06, + "loss": 0.0007, + "step": 1983 + }, + { + "epoch": 1.7599201950786965, + "grad_norm": 0.2438761293888092, + "learning_rate": 3.844764044631986e-06, + "loss": 0.0023, + "step": 1984 + }, + { + "epoch": 1.760806916426513, + "grad_norm": 0.034038957208395004, + "learning_rate": 3.840106560364119e-06, + "loss": 0.0001, + "step": 1985 + }, + { + "epoch": 1.7616936377743295, + "grad_norm": 0.11754358559846878, + "learning_rate": 3.835450139517524e-06, + "loss": 0.0004, + "step": 1986 + }, + { + "epoch": 1.7625803591221458, + "grad_norm": 0.49829450249671936, + "learning_rate": 3.83079478636133e-06, + "loss": 0.0027, + "step": 1987 + }, + { + "epoch": 1.7634670804699624, + "grad_norm": 0.04286176338791847, + "learning_rate": 3.826140505163694e-06, + "loss": 0.0002, + "step": 1988 + }, + { + "epoch": 1.7643538018177787, + "grad_norm": 0.1739022135734558, + "learning_rate": 3.821487300191782e-06, + "loss": 0.0007, + "step": 1989 + }, + { + "epoch": 1.7652405231655952, + "grad_norm": 0.07864267379045486, + "learning_rate": 3.816835175711775e-06, + "loss": 0.0005, + "step": 1990 + }, + { + "epoch": 1.7661272445134117, + "grad_norm": 0.05060047656297684, + "learning_rate": 3.812184135988868e-06, + "loss": 0.0001, + "step": 1991 + }, + { + "epoch": 1.7670139658612283, + "grad_norm": 0.30648404359817505, + "learning_rate": 3.8075341852872583e-06, + "loss": 0.001, + "step": 1992 + }, + { + "epoch": 1.7679006872090446, + "grad_norm": 0.15932850539684296, + "learning_rate": 3.802885327870143e-06, + "loss": 0.0016, + "step": 1993 + }, + { + "epoch": 1.768787408556861, + "grad_norm": 0.021853476762771606, + "learning_rate": 3.7982375679997186e-06, + "loss": 0.0002, + "step": 1994 + }, + { + "epoch": 1.7696741299046774, + "grad_norm": 0.0491185262799263, + "learning_rate": 3.7935909099371784e-06, + "loss": 0.0002, + "step": 1995 + }, + { + "epoch": 1.770560851252494, + "grad_norm": 0.08880337327718735, + "learning_rate": 3.7889453579426993e-06, + "loss": 0.0005, + "step": 1996 + }, + { + "epoch": 1.7714475726003105, + "grad_norm": 0.0074866595678031445, + "learning_rate": 3.7843009162754503e-06, + "loss": 0.0001, + "step": 1997 + }, + { + "epoch": 1.7723342939481268, + "grad_norm": 0.43862977623939514, + "learning_rate": 3.7796575891935748e-06, + "loss": 0.0025, + "step": 1998 + }, + { + "epoch": 1.7732210152959431, + "grad_norm": 0.1807900071144104, + "learning_rate": 3.7750153809542052e-06, + "loss": 0.0005, + "step": 1999 + }, + { + "epoch": 1.7741077366437596, + "grad_norm": 0.1964527666568756, + "learning_rate": 3.7703742958134383e-06, + "loss": 0.001, + "step": 2000 + }, + { + "epoch": 1.7749944579915762, + "grad_norm": 0.6390202641487122, + "learning_rate": 3.765734338026343e-06, + "loss": 0.0026, + "step": 2001 + }, + { + "epoch": 1.7758811793393927, + "grad_norm": 0.2554784417152405, + "learning_rate": 3.761095511846962e-06, + "loss": 0.0028, + "step": 2002 + }, + { + "epoch": 1.776767900687209, + "grad_norm": 0.42926931381225586, + "learning_rate": 3.756457821528292e-06, + "loss": 0.0014, + "step": 2003 + }, + { + "epoch": 1.7776546220350253, + "grad_norm": 0.44991567730903625, + "learning_rate": 3.7518212713222905e-06, + "loss": 0.0023, + "step": 2004 + }, + { + "epoch": 1.7785413433828419, + "grad_norm": 0.2629556655883789, + "learning_rate": 3.7471858654798697e-06, + "loss": 0.0007, + "step": 2005 + }, + { + "epoch": 1.7794280647306584, + "grad_norm": 0.003782880026847124, + "learning_rate": 3.742551608250896e-06, + "loss": 0.0, + "step": 2006 + }, + { + "epoch": 1.780314786078475, + "grad_norm": 0.17819075286388397, + "learning_rate": 3.73791850388418e-06, + "loss": 0.0009, + "step": 2007 + }, + { + "epoch": 1.7812015074262912, + "grad_norm": 0.5463308095932007, + "learning_rate": 3.733286556627473e-06, + "loss": 0.003, + "step": 2008 + }, + { + "epoch": 1.7820882287741078, + "grad_norm": 0.07021505385637283, + "learning_rate": 3.7286557707274707e-06, + "loss": 0.0002, + "step": 2009 + }, + { + "epoch": 1.782974950121924, + "grad_norm": 0.3599243760108948, + "learning_rate": 3.7240261504298002e-06, + "loss": 0.0067, + "step": 2010 + }, + { + "epoch": 1.7838616714697406, + "grad_norm": 0.04074014723300934, + "learning_rate": 3.7193976999790216e-06, + "loss": 0.0002, + "step": 2011 + }, + { + "epoch": 1.7847483928175571, + "grad_norm": 0.08518295735120773, + "learning_rate": 3.714770423618618e-06, + "loss": 0.0002, + "step": 2012 + }, + { + "epoch": 1.7856351141653737, + "grad_norm": 0.10783907026052475, + "learning_rate": 3.7101443255910062e-06, + "loss": 0.0003, + "step": 2013 + }, + { + "epoch": 1.78652183551319, + "grad_norm": 0.31293633580207825, + "learning_rate": 3.7055194101375127e-06, + "loss": 0.0023, + "step": 2014 + }, + { + "epoch": 1.7874085568610063, + "grad_norm": 0.5668357014656067, + "learning_rate": 3.700895681498381e-06, + "loss": 0.0053, + "step": 2015 + }, + { + "epoch": 1.7882952782088228, + "grad_norm": 0.01921240985393524, + "learning_rate": 3.696273143912775e-06, + "loss": 0.0001, + "step": 2016 + }, + { + "epoch": 1.7891819995566394, + "grad_norm": 0.018799765035510063, + "learning_rate": 3.6916518016187573e-06, + "loss": 0.0001, + "step": 2017 + }, + { + "epoch": 1.790068720904456, + "grad_norm": 0.11832908540964127, + "learning_rate": 3.6870316588532966e-06, + "loss": 0.0004, + "step": 2018 + }, + { + "epoch": 1.7909554422522722, + "grad_norm": 0.10770263522863388, + "learning_rate": 3.6824127198522652e-06, + "loss": 0.0005, + "step": 2019 + }, + { + "epoch": 1.7918421636000885, + "grad_norm": 0.01235107984393835, + "learning_rate": 3.6777949888504293e-06, + "loss": 0.0, + "step": 2020 + }, + { + "epoch": 1.792728884947905, + "grad_norm": 0.006794508080929518, + "learning_rate": 3.6731784700814476e-06, + "loss": 0.0, + "step": 2021 + }, + { + "epoch": 1.7936156062957216, + "grad_norm": 0.01359777431935072, + "learning_rate": 3.668563167777868e-06, + "loss": 0.0001, + "step": 2022 + }, + { + "epoch": 1.7945023276435381, + "grad_norm": 0.13341398537158966, + "learning_rate": 3.6639490861711224e-06, + "loss": 0.0005, + "step": 2023 + }, + { + "epoch": 1.7953890489913544, + "grad_norm": 0.022680554538965225, + "learning_rate": 3.659336229491525e-06, + "loss": 0.0001, + "step": 2024 + }, + { + "epoch": 1.796275770339171, + "grad_norm": 0.06998679041862488, + "learning_rate": 3.654724601968266e-06, + "loss": 0.0002, + "step": 2025 + }, + { + "epoch": 1.7971624916869873, + "grad_norm": 0.20700500905513763, + "learning_rate": 3.6501142078294053e-06, + "loss": 0.003, + "step": 2026 + }, + { + "epoch": 1.7980492130348038, + "grad_norm": 0.07539916038513184, + "learning_rate": 3.6455050513018807e-06, + "loss": 0.0003, + "step": 2027 + }, + { + "epoch": 1.7989359343826203, + "grad_norm": 0.45003005862236023, + "learning_rate": 3.6408971366114874e-06, + "loss": 0.0026, + "step": 2028 + }, + { + "epoch": 1.7998226557304369, + "grad_norm": 0.022108634933829308, + "learning_rate": 3.6362904679828814e-06, + "loss": 0.0001, + "step": 2029 + }, + { + "epoch": 1.8007093770782532, + "grad_norm": 0.24113281071186066, + "learning_rate": 3.6316850496395863e-06, + "loss": 0.0008, + "step": 2030 + }, + { + "epoch": 1.8015960984260695, + "grad_norm": 0.10309310257434845, + "learning_rate": 3.6270808858039677e-06, + "loss": 0.0003, + "step": 2031 + }, + { + "epoch": 1.802482819773886, + "grad_norm": 0.404070645570755, + "learning_rate": 3.6224779806972472e-06, + "loss": 0.0009, + "step": 2032 + }, + { + "epoch": 1.8033695411217026, + "grad_norm": 0.25489571690559387, + "learning_rate": 3.617876338539489e-06, + "loss": 0.0014, + "step": 2033 + }, + { + "epoch": 1.804256262469519, + "grad_norm": 0.9427079558372498, + "learning_rate": 3.6132759635496057e-06, + "loss": 0.005, + "step": 2034 + }, + { + "epoch": 1.8051429838173354, + "grad_norm": 0.02262832224369049, + "learning_rate": 3.6086768599453416e-06, + "loss": 0.0001, + "step": 2035 + }, + { + "epoch": 1.8060297051651517, + "grad_norm": 0.33301153779029846, + "learning_rate": 3.6040790319432756e-06, + "loss": 0.0031, + "step": 2036 + }, + { + "epoch": 1.8069164265129682, + "grad_norm": 0.20669648051261902, + "learning_rate": 3.5994824837588235e-06, + "loss": 0.0021, + "step": 2037 + }, + { + "epoch": 1.8078031478607848, + "grad_norm": 0.12887296080589294, + "learning_rate": 3.594887219606221e-06, + "loss": 0.0007, + "step": 2038 + }, + { + "epoch": 1.8086898692086013, + "grad_norm": 0.017986498773097992, + "learning_rate": 3.5902932436985293e-06, + "loss": 0.0002, + "step": 2039 + }, + { + "epoch": 1.8095765905564176, + "grad_norm": 0.11879166960716248, + "learning_rate": 3.585700560247626e-06, + "loss": 0.0005, + "step": 2040 + }, + { + "epoch": 1.810463311904234, + "grad_norm": 0.011947376653552055, + "learning_rate": 3.58110917346421e-06, + "loss": 0.0001, + "step": 2041 + }, + { + "epoch": 1.8113500332520505, + "grad_norm": 0.427824467420578, + "learning_rate": 3.5765190875577855e-06, + "loss": 0.0025, + "step": 2042 + }, + { + "epoch": 1.812236754599867, + "grad_norm": 0.5713347792625427, + "learning_rate": 3.571930306736663e-06, + "loss": 0.0046, + "step": 2043 + }, + { + "epoch": 1.8131234759476835, + "grad_norm": 0.021777095273137093, + "learning_rate": 3.567342835207964e-06, + "loss": 0.0001, + "step": 2044 + }, + { + "epoch": 1.8140101972954998, + "grad_norm": 0.026599332690238953, + "learning_rate": 3.562756677177602e-06, + "loss": 0.0001, + "step": 2045 + }, + { + "epoch": 1.8148969186433164, + "grad_norm": 0.10613416880369186, + "learning_rate": 3.5581718368502906e-06, + "loss": 0.0005, + "step": 2046 + }, + { + "epoch": 1.8157836399911327, + "grad_norm": 0.5284315347671509, + "learning_rate": 3.5535883184295316e-06, + "loss": 0.0016, + "step": 2047 + }, + { + "epoch": 1.8166703613389492, + "grad_norm": 0.087979257106781, + "learning_rate": 3.549006126117619e-06, + "loss": 0.0005, + "step": 2048 + }, + { + "epoch": 1.8175570826867657, + "grad_norm": 0.006895984057337046, + "learning_rate": 3.54442526411563e-06, + "loss": 0.0001, + "step": 2049 + }, + { + "epoch": 1.8184438040345823, + "grad_norm": 0.15001380443572998, + "learning_rate": 3.5398457366234174e-06, + "loss": 0.0002, + "step": 2050 + }, + { + "epoch": 1.8193305253823986, + "grad_norm": 0.011246156878769398, + "learning_rate": 3.535267547839617e-06, + "loss": 0.0001, + "step": 2051 + }, + { + "epoch": 1.820217246730215, + "grad_norm": 0.006199950352311134, + "learning_rate": 3.5306907019616356e-06, + "loss": 0.0, + "step": 2052 + }, + { + "epoch": 1.8211039680780314, + "grad_norm": 0.013676552101969719, + "learning_rate": 3.5261152031856457e-06, + "loss": 0.0001, + "step": 2053 + }, + { + "epoch": 1.821990689425848, + "grad_norm": 0.10605849325656891, + "learning_rate": 3.5215410557065856e-06, + "loss": 0.0003, + "step": 2054 + }, + { + "epoch": 1.8228774107736645, + "grad_norm": 0.05242261290550232, + "learning_rate": 3.516968263718159e-06, + "loss": 0.0001, + "step": 2055 + }, + { + "epoch": 1.8237641321214808, + "grad_norm": 0.12605375051498413, + "learning_rate": 3.512396831412822e-06, + "loss": 0.0007, + "step": 2056 + }, + { + "epoch": 1.8246508534692971, + "grad_norm": 0.004360876511782408, + "learning_rate": 3.507826762981784e-06, + "loss": 0.0, + "step": 2057 + }, + { + "epoch": 1.8255375748171137, + "grad_norm": 0.4643060564994812, + "learning_rate": 3.50325806261501e-06, + "loss": 0.0056, + "step": 2058 + }, + { + "epoch": 1.8264242961649302, + "grad_norm": 0.06710820645093918, + "learning_rate": 3.498690734501204e-06, + "loss": 0.0003, + "step": 2059 + }, + { + "epoch": 1.8273110175127467, + "grad_norm": 0.36204835772514343, + "learning_rate": 3.4941247828278142e-06, + "loss": 0.0048, + "step": 2060 + }, + { + "epoch": 1.828197738860563, + "grad_norm": 0.143352210521698, + "learning_rate": 3.4895602117810256e-06, + "loss": 0.0004, + "step": 2061 + }, + { + "epoch": 1.8290844602083796, + "grad_norm": 0.15158715844154358, + "learning_rate": 3.484997025545762e-06, + "loss": 0.0008, + "step": 2062 + }, + { + "epoch": 1.8299711815561959, + "grad_norm": 0.0470881387591362, + "learning_rate": 3.480435228305673e-06, + "loss": 0.0002, + "step": 2063 + }, + { + "epoch": 1.8308579029040124, + "grad_norm": 0.15281467139720917, + "learning_rate": 3.4758748242431363e-06, + "loss": 0.001, + "step": 2064 + }, + { + "epoch": 1.831744624251829, + "grad_norm": 0.44346439838409424, + "learning_rate": 3.4713158175392493e-06, + "loss": 0.0027, + "step": 2065 + }, + { + "epoch": 1.8326313455996455, + "grad_norm": 0.09658630192279816, + "learning_rate": 3.466758212373836e-06, + "loss": 0.0005, + "step": 2066 + }, + { + "epoch": 1.8335180669474618, + "grad_norm": 0.19065168499946594, + "learning_rate": 3.4622020129254273e-06, + "loss": 0.0014, + "step": 2067 + }, + { + "epoch": 1.834404788295278, + "grad_norm": 0.39815741777420044, + "learning_rate": 3.4576472233712677e-06, + "loss": 0.0017, + "step": 2068 + }, + { + "epoch": 1.8352915096430946, + "grad_norm": 0.020850352942943573, + "learning_rate": 3.4530938478873134e-06, + "loss": 0.0001, + "step": 2069 + }, + { + "epoch": 1.8361782309909112, + "grad_norm": 0.0858813151717186, + "learning_rate": 3.448541890648217e-06, + "loss": 0.0005, + "step": 2070 + }, + { + "epoch": 1.8370649523387277, + "grad_norm": 0.016161078587174416, + "learning_rate": 3.4439913558273374e-06, + "loss": 0.0001, + "step": 2071 + }, + { + "epoch": 1.837951673686544, + "grad_norm": 0.08752348273992538, + "learning_rate": 3.439442247596724e-06, + "loss": 0.0004, + "step": 2072 + }, + { + "epoch": 1.8388383950343603, + "grad_norm": 0.07176404446363449, + "learning_rate": 3.4348945701271218e-06, + "loss": 0.0003, + "step": 2073 + }, + { + "epoch": 1.8397251163821768, + "grad_norm": 0.021734774112701416, + "learning_rate": 3.4303483275879633e-06, + "loss": 0.0001, + "step": 2074 + }, + { + "epoch": 1.8406118377299934, + "grad_norm": 0.13391822576522827, + "learning_rate": 3.425803524147364e-06, + "loss": 0.0006, + "step": 2075 + }, + { + "epoch": 1.84149855907781, + "grad_norm": 0.43593868613243103, + "learning_rate": 3.421260163972121e-06, + "loss": 0.0015, + "step": 2076 + }, + { + "epoch": 1.8423852804256262, + "grad_norm": 0.037805862724781036, + "learning_rate": 3.4167182512277105e-06, + "loss": 0.0001, + "step": 2077 + }, + { + "epoch": 1.8432720017734427, + "grad_norm": 0.019509362056851387, + "learning_rate": 3.412177790078277e-06, + "loss": 0.0001, + "step": 2078 + }, + { + "epoch": 1.844158723121259, + "grad_norm": 0.09484810382127762, + "learning_rate": 3.4076387846866353e-06, + "loss": 0.0006, + "step": 2079 + }, + { + "epoch": 1.8450454444690756, + "grad_norm": 0.046936165541410446, + "learning_rate": 3.403101239214271e-06, + "loss": 0.0002, + "step": 2080 + }, + { + "epoch": 1.8459321658168921, + "grad_norm": 0.11847790330648422, + "learning_rate": 3.3985651578213243e-06, + "loss": 0.0005, + "step": 2081 + }, + { + "epoch": 1.8468188871647087, + "grad_norm": 0.09800742566585541, + "learning_rate": 3.394030544666593e-06, + "loss": 0.0003, + "step": 2082 + }, + { + "epoch": 1.847705608512525, + "grad_norm": 0.29517197608947754, + "learning_rate": 3.3894974039075377e-06, + "loss": 0.0024, + "step": 2083 + }, + { + "epoch": 1.8485923298603413, + "grad_norm": 0.0667266771197319, + "learning_rate": 3.3849657397002588e-06, + "loss": 0.0002, + "step": 2084 + }, + { + "epoch": 1.8494790512081578, + "grad_norm": 0.3751622140407562, + "learning_rate": 3.3804355561995085e-06, + "loss": 0.0022, + "step": 2085 + }, + { + "epoch": 1.8503657725559743, + "grad_norm": 0.6706212162971497, + "learning_rate": 3.375906857558676e-06, + "loss": 0.0024, + "step": 2086 + }, + { + "epoch": 1.8512524939037909, + "grad_norm": 0.08441176265478134, + "learning_rate": 3.3713796479298e-06, + "loss": 0.0003, + "step": 2087 + }, + { + "epoch": 1.8521392152516072, + "grad_norm": 0.3854922652244568, + "learning_rate": 3.3668539314635428e-06, + "loss": 0.0024, + "step": 2088 + }, + { + "epoch": 1.8530259365994235, + "grad_norm": 0.40729087591171265, + "learning_rate": 3.3623297123092007e-06, + "loss": 0.0042, + "step": 2089 + }, + { + "epoch": 1.85391265794724, + "grad_norm": 0.05538596957921982, + "learning_rate": 3.357806994614703e-06, + "loss": 0.0003, + "step": 2090 + }, + { + "epoch": 1.8547993792950566, + "grad_norm": 0.6646614670753479, + "learning_rate": 3.353285782526596e-06, + "loss": 0.005, + "step": 2091 + }, + { + "epoch": 1.855686100642873, + "grad_norm": 0.0477665476500988, + "learning_rate": 3.348766080190046e-06, + "loss": 0.0001, + "step": 2092 + }, + { + "epoch": 1.8565728219906894, + "grad_norm": 0.1342734843492508, + "learning_rate": 3.3442478917488384e-06, + "loss": 0.0009, + "step": 2093 + }, + { + "epoch": 1.8574595433385057, + "grad_norm": 0.36796683073043823, + "learning_rate": 3.33973122134537e-06, + "loss": 0.0031, + "step": 2094 + }, + { + "epoch": 1.8583462646863222, + "grad_norm": 0.5031121969223022, + "learning_rate": 3.335216073120643e-06, + "loss": 0.0021, + "step": 2095 + }, + { + "epoch": 1.8592329860341388, + "grad_norm": 0.4082201421260834, + "learning_rate": 3.330702451214266e-06, + "loss": 0.0012, + "step": 2096 + }, + { + "epoch": 1.8601197073819553, + "grad_norm": 0.6267004013061523, + "learning_rate": 3.3261903597644497e-06, + "loss": 0.0019, + "step": 2097 + }, + { + "epoch": 1.8610064287297716, + "grad_norm": 0.12733691930770874, + "learning_rate": 3.321679802907997e-06, + "loss": 0.0004, + "step": 2098 + }, + { + "epoch": 1.8618931500775882, + "grad_norm": 0.030501484870910645, + "learning_rate": 3.31717078478031e-06, + "loss": 0.0001, + "step": 2099 + }, + { + "epoch": 1.8627798714254045, + "grad_norm": 0.41649121046066284, + "learning_rate": 3.3126633095153745e-06, + "loss": 0.003, + "step": 2100 + }, + { + "epoch": 1.863666592773221, + "grad_norm": 0.25532305240631104, + "learning_rate": 3.308157381245767e-06, + "loss": 0.0014, + "step": 2101 + }, + { + "epoch": 1.8645533141210375, + "grad_norm": 0.020190473645925522, + "learning_rate": 3.3036530041026405e-06, + "loss": 0.0001, + "step": 2102 + }, + { + "epoch": 1.865440035468854, + "grad_norm": 0.28363004326820374, + "learning_rate": 3.2991501822157266e-06, + "loss": 0.0005, + "step": 2103 + }, + { + "epoch": 1.8663267568166704, + "grad_norm": 0.008495116606354713, + "learning_rate": 3.2946489197133367e-06, + "loss": 0.0, + "step": 2104 + }, + { + "epoch": 1.8672134781644867, + "grad_norm": 0.5957363247871399, + "learning_rate": 3.2901492207223475e-06, + "loss": 0.0014, + "step": 2105 + }, + { + "epoch": 1.8681001995123032, + "grad_norm": 0.049815353006124496, + "learning_rate": 3.285651089368202e-06, + "loss": 0.0002, + "step": 2106 + }, + { + "epoch": 1.8689869208601197, + "grad_norm": 0.1587996631860733, + "learning_rate": 3.281154529774905e-06, + "loss": 0.001, + "step": 2107 + }, + { + "epoch": 1.8698736422079363, + "grad_norm": 0.058114007115364075, + "learning_rate": 3.276659546065027e-06, + "loss": 0.0002, + "step": 2108 + }, + { + "epoch": 1.8707603635557526, + "grad_norm": 0.010123222135007381, + "learning_rate": 3.2721661423596866e-06, + "loss": 0.0, + "step": 2109 + }, + { + "epoch": 1.871647084903569, + "grad_norm": 0.038756851106882095, + "learning_rate": 3.2676743227785545e-06, + "loss": 0.0002, + "step": 2110 + }, + { + "epoch": 1.8725338062513854, + "grad_norm": 0.0023471799213439226, + "learning_rate": 3.263184091439855e-06, + "loss": 0.0, + "step": 2111 + }, + { + "epoch": 1.873420527599202, + "grad_norm": 0.5632521510124207, + "learning_rate": 3.25869545246035e-06, + "loss": 0.0043, + "step": 2112 + }, + { + "epoch": 1.8743072489470185, + "grad_norm": 0.0417516715824604, + "learning_rate": 3.2542084099553426e-06, + "loss": 0.0002, + "step": 2113 + }, + { + "epoch": 1.8751939702948348, + "grad_norm": 0.30594179034233093, + "learning_rate": 3.249722968038673e-06, + "loss": 0.0008, + "step": 2114 + }, + { + "epoch": 1.8760806916426513, + "grad_norm": 0.009588775224983692, + "learning_rate": 3.245239130822716e-06, + "loss": 0.0001, + "step": 2115 + }, + { + "epoch": 1.8769674129904677, + "grad_norm": 0.3712175190448761, + "learning_rate": 3.2407569024183734e-06, + "loss": 0.0059, + "step": 2116 + }, + { + "epoch": 1.8778541343382842, + "grad_norm": 0.48841285705566406, + "learning_rate": 3.2362762869350685e-06, + "loss": 0.0025, + "step": 2117 + }, + { + "epoch": 1.8787408556861007, + "grad_norm": 0.19866853952407837, + "learning_rate": 3.231797288480753e-06, + "loss": 0.001, + "step": 2118 + }, + { + "epoch": 1.8796275770339173, + "grad_norm": 0.017454639077186584, + "learning_rate": 3.227319911161891e-06, + "loss": 0.0001, + "step": 2119 + }, + { + "epoch": 1.8805142983817336, + "grad_norm": 0.03350000083446503, + "learning_rate": 3.2228441590834608e-06, + "loss": 0.0002, + "step": 2120 + }, + { + "epoch": 1.8814010197295499, + "grad_norm": 1.9975967407226562, + "learning_rate": 3.218370036348952e-06, + "loss": 0.0062, + "step": 2121 + }, + { + "epoch": 1.8822877410773664, + "grad_norm": 0.011665020138025284, + "learning_rate": 3.21389754706036e-06, + "loss": 0.0, + "step": 2122 + }, + { + "epoch": 1.883174462425183, + "grad_norm": 0.7389943599700928, + "learning_rate": 3.2094266953181817e-06, + "loss": 0.0015, + "step": 2123 + }, + { + "epoch": 1.8840611837729995, + "grad_norm": 0.293914258480072, + "learning_rate": 3.204957485221413e-06, + "loss": 0.001, + "step": 2124 + }, + { + "epoch": 1.8849479051208158, + "grad_norm": 0.09404734522104263, + "learning_rate": 3.2004899208675468e-06, + "loss": 0.0004, + "step": 2125 + }, + { + "epoch": 1.885834626468632, + "grad_norm": 0.5290479063987732, + "learning_rate": 3.1960240063525648e-06, + "loss": 0.0008, + "step": 2126 + }, + { + "epoch": 1.8867213478164486, + "grad_norm": 0.06976180523633957, + "learning_rate": 3.1915597457709364e-06, + "loss": 0.0003, + "step": 2127 + }, + { + "epoch": 1.8876080691642652, + "grad_norm": 0.2480458766222, + "learning_rate": 3.187097143215612e-06, + "loss": 0.0029, + "step": 2128 + }, + { + "epoch": 1.8884947905120817, + "grad_norm": 0.45888465642929077, + "learning_rate": 3.18263620277803e-06, + "loss": 0.0034, + "step": 2129 + }, + { + "epoch": 1.889381511859898, + "grad_norm": 0.4611452519893646, + "learning_rate": 3.1781769285480977e-06, + "loss": 0.0034, + "step": 2130 + }, + { + "epoch": 1.8902682332077143, + "grad_norm": 0.011414594016969204, + "learning_rate": 3.1737193246141933e-06, + "loss": 0.0001, + "step": 2131 + }, + { + "epoch": 1.8911549545555308, + "grad_norm": 0.008071592077612877, + "learning_rate": 3.1692633950631723e-06, + "loss": 0.0001, + "step": 2132 + }, + { + "epoch": 1.8920416759033474, + "grad_norm": 0.09630648791790009, + "learning_rate": 3.164809143980348e-06, + "loss": 0.0005, + "step": 2133 + }, + { + "epoch": 1.892928397251164, + "grad_norm": 0.03766035661101341, + "learning_rate": 3.160356575449496e-06, + "loss": 0.0002, + "step": 2134 + }, + { + "epoch": 1.8938151185989802, + "grad_norm": 0.011658454313874245, + "learning_rate": 3.1559056935528486e-06, + "loss": 0.0001, + "step": 2135 + }, + { + "epoch": 1.8947018399467968, + "grad_norm": 0.24089200794696808, + "learning_rate": 3.1514565023710964e-06, + "loss": 0.0032, + "step": 2136 + }, + { + "epoch": 1.895588561294613, + "grad_norm": 0.134489506483078, + "learning_rate": 3.1470090059833752e-06, + "loss": 0.0003, + "step": 2137 + }, + { + "epoch": 1.8964752826424296, + "grad_norm": 0.2924594283103943, + "learning_rate": 3.1425632084672663e-06, + "loss": 0.0011, + "step": 2138 + }, + { + "epoch": 1.8973620039902461, + "grad_norm": 0.14403395354747772, + "learning_rate": 3.1381191138988e-06, + "loss": 0.0005, + "step": 2139 + }, + { + "epoch": 1.8982487253380627, + "grad_norm": 0.24188585579395294, + "learning_rate": 3.133676726352438e-06, + "loss": 0.0015, + "step": 2140 + }, + { + "epoch": 1.899135446685879, + "grad_norm": 0.31872034072875977, + "learning_rate": 3.129236049901081e-06, + "loss": 0.0027, + "step": 2141 + }, + { + "epoch": 1.9000221680336953, + "grad_norm": 0.2157914638519287, + "learning_rate": 3.124797088616056e-06, + "loss": 0.0016, + "step": 2142 + }, + { + "epoch": 1.9009088893815118, + "grad_norm": 0.24194389581680298, + "learning_rate": 3.120359846567127e-06, + "loss": 0.0004, + "step": 2143 + }, + { + "epoch": 1.9017956107293283, + "grad_norm": 0.052015915513038635, + "learning_rate": 3.1159243278224727e-06, + "loss": 0.0003, + "step": 2144 + }, + { + "epoch": 1.9026823320771449, + "grad_norm": 0.07202459126710892, + "learning_rate": 3.1114905364486934e-06, + "loss": 0.0005, + "step": 2145 + }, + { + "epoch": 1.9035690534249612, + "grad_norm": 0.12261311709880829, + "learning_rate": 3.107058476510812e-06, + "loss": 0.0009, + "step": 2146 + }, + { + "epoch": 1.9044557747727775, + "grad_norm": 0.12206902354955673, + "learning_rate": 3.1026281520722555e-06, + "loss": 0.0004, + "step": 2147 + }, + { + "epoch": 1.905342496120594, + "grad_norm": 0.054110489785671234, + "learning_rate": 3.098199567194867e-06, + "loss": 0.0003, + "step": 2148 + }, + { + "epoch": 1.9062292174684106, + "grad_norm": 0.10819480568170547, + "learning_rate": 3.0937727259388877e-06, + "loss": 0.0005, + "step": 2149 + }, + { + "epoch": 1.907115938816227, + "grad_norm": 0.2881719470024109, + "learning_rate": 3.0893476323629657e-06, + "loss": 0.0014, + "step": 2150 + }, + { + "epoch": 1.9080026601640434, + "grad_norm": 0.5649845004081726, + "learning_rate": 3.0849242905241462e-06, + "loss": 0.0016, + "step": 2151 + }, + { + "epoch": 1.90888938151186, + "grad_norm": 0.09853377938270569, + "learning_rate": 3.0805027044778647e-06, + "loss": 0.0005, + "step": 2152 + }, + { + "epoch": 1.9097761028596763, + "grad_norm": 0.25022897124290466, + "learning_rate": 3.0760828782779496e-06, + "loss": 0.001, + "step": 2153 + }, + { + "epoch": 1.9106628242074928, + "grad_norm": 0.47225239872932434, + "learning_rate": 3.071664815976618e-06, + "loss": 0.003, + "step": 2154 + }, + { + "epoch": 1.9115495455553093, + "grad_norm": 0.6766409277915955, + "learning_rate": 3.067248521624465e-06, + "loss": 0.0027, + "step": 2155 + }, + { + "epoch": 1.9124362669031258, + "grad_norm": 0.1587907373905182, + "learning_rate": 3.062833999270467e-06, + "loss": 0.0005, + "step": 2156 + }, + { + "epoch": 1.9133229882509422, + "grad_norm": 0.01313844695687294, + "learning_rate": 3.0584212529619777e-06, + "loss": 0.0001, + "step": 2157 + }, + { + "epoch": 1.9142097095987585, + "grad_norm": 0.0655895546078682, + "learning_rate": 3.0540102867447204e-06, + "loss": 0.0002, + "step": 2158 + }, + { + "epoch": 1.915096430946575, + "grad_norm": 0.4105219841003418, + "learning_rate": 3.049601104662784e-06, + "loss": 0.0025, + "step": 2159 + }, + { + "epoch": 1.9159831522943915, + "grad_norm": 0.0187697596848011, + "learning_rate": 3.045193710758628e-06, + "loss": 0.0001, + "step": 2160 + }, + { + "epoch": 1.916869873642208, + "grad_norm": 0.01989288628101349, + "learning_rate": 3.0407881090730677e-06, + "loss": 0.0001, + "step": 2161 + }, + { + "epoch": 1.9177565949900244, + "grad_norm": 0.01322174072265625, + "learning_rate": 3.0363843036452762e-06, + "loss": 0.0001, + "step": 2162 + }, + { + "epoch": 1.9186433163378407, + "grad_norm": 0.03816716745495796, + "learning_rate": 3.031982298512777e-06, + "loss": 0.0002, + "step": 2163 + }, + { + "epoch": 1.9195300376856572, + "grad_norm": 0.4447609484195709, + "learning_rate": 3.027582097711451e-06, + "loss": 0.0022, + "step": 2164 + }, + { + "epoch": 1.9204167590334738, + "grad_norm": 0.10088513046503067, + "learning_rate": 3.0231837052755176e-06, + "loss": 0.0004, + "step": 2165 + }, + { + "epoch": 1.9213034803812903, + "grad_norm": 0.3154107928276062, + "learning_rate": 3.0187871252375394e-06, + "loss": 0.006, + "step": 2166 + }, + { + "epoch": 1.9221902017291066, + "grad_norm": 0.03484093025326729, + "learning_rate": 3.0143923616284176e-06, + "loss": 0.0002, + "step": 2167 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.062375616282224655, + "learning_rate": 3.0099994184773918e-06, + "loss": 0.0003, + "step": 2168 + }, + { + "epoch": 1.9239636444247394, + "grad_norm": 0.1312718689441681, + "learning_rate": 3.0056082998120283e-06, + "loss": 0.0005, + "step": 2169 + }, + { + "epoch": 1.924850365772556, + "grad_norm": 0.030425092205405235, + "learning_rate": 3.0012190096582195e-06, + "loss": 0.0001, + "step": 2170 + }, + { + "epoch": 1.9257370871203725, + "grad_norm": 0.022313162684440613, + "learning_rate": 2.996831552040188e-06, + "loss": 0.0001, + "step": 2171 + }, + { + "epoch": 1.926623808468189, + "grad_norm": 0.49964627623558044, + "learning_rate": 2.9924459309804693e-06, + "loss": 0.0058, + "step": 2172 + }, + { + "epoch": 1.9275105298160053, + "grad_norm": 0.014140648767352104, + "learning_rate": 2.9880621504999196e-06, + "loss": 0.0001, + "step": 2173 + }, + { + "epoch": 1.9283972511638217, + "grad_norm": 0.29723238945007324, + "learning_rate": 2.9836802146177034e-06, + "loss": 0.0012, + "step": 2174 + }, + { + "epoch": 1.9292839725116382, + "grad_norm": 0.04823289066553116, + "learning_rate": 2.979300127351299e-06, + "loss": 0.0001, + "step": 2175 + }, + { + "epoch": 1.9301706938594547, + "grad_norm": 0.8130165338516235, + "learning_rate": 2.9749218927164864e-06, + "loss": 0.0102, + "step": 2176 + }, + { + "epoch": 1.9310574152072713, + "grad_norm": 0.0337294302880764, + "learning_rate": 2.9705455147273467e-06, + "loss": 0.0002, + "step": 2177 + }, + { + "epoch": 1.9319441365550876, + "grad_norm": 0.25490546226501465, + "learning_rate": 2.9661709973962626e-06, + "loss": 0.0006, + "step": 2178 + }, + { + "epoch": 1.9328308579029039, + "grad_norm": 0.02805035188794136, + "learning_rate": 2.961798344733907e-06, + "loss": 0.0001, + "step": 2179 + }, + { + "epoch": 1.9337175792507204, + "grad_norm": 0.1288713812828064, + "learning_rate": 2.9574275607492454e-06, + "loss": 0.0006, + "step": 2180 + }, + { + "epoch": 1.934604300598537, + "grad_norm": 0.654469907283783, + "learning_rate": 2.9530586494495267e-06, + "loss": 0.0095, + "step": 2181 + }, + { + "epoch": 1.9354910219463535, + "grad_norm": 0.045253507792949677, + "learning_rate": 2.948691614840289e-06, + "loss": 0.0002, + "step": 2182 + }, + { + "epoch": 1.9363777432941698, + "grad_norm": 0.0367266871035099, + "learning_rate": 2.9443264609253443e-06, + "loss": 0.0003, + "step": 2183 + }, + { + "epoch": 1.937264464641986, + "grad_norm": 0.014604181051254272, + "learning_rate": 2.9399631917067806e-06, + "loss": 0.0001, + "step": 2184 + }, + { + "epoch": 1.9381511859898026, + "grad_norm": 0.02937917225062847, + "learning_rate": 2.935601811184964e-06, + "loss": 0.0001, + "step": 2185 + }, + { + "epoch": 1.9390379073376192, + "grad_norm": 0.01375783421099186, + "learning_rate": 2.9312423233585214e-06, + "loss": 0.0001, + "step": 2186 + }, + { + "epoch": 1.9399246286854357, + "grad_norm": 0.25753626227378845, + "learning_rate": 2.926884732224349e-06, + "loss": 0.0019, + "step": 2187 + }, + { + "epoch": 1.940811350033252, + "grad_norm": 0.09537655860185623, + "learning_rate": 2.922529041777599e-06, + "loss": 0.0003, + "step": 2188 + }, + { + "epoch": 1.9416980713810685, + "grad_norm": 0.1624821424484253, + "learning_rate": 2.9181752560116894e-06, + "loss": 0.0012, + "step": 2189 + }, + { + "epoch": 1.9425847927288848, + "grad_norm": 0.021007006987929344, + "learning_rate": 2.913823378918287e-06, + "loss": 0.0001, + "step": 2190 + }, + { + "epoch": 1.9434715140767014, + "grad_norm": 0.268496572971344, + "learning_rate": 2.9094734144873037e-06, + "loss": 0.0018, + "step": 2191 + }, + { + "epoch": 1.944358235424518, + "grad_norm": 0.04218098148703575, + "learning_rate": 2.9051253667069107e-06, + "loss": 0.0002, + "step": 2192 + }, + { + "epoch": 1.9452449567723344, + "grad_norm": 0.07987596094608307, + "learning_rate": 2.900779239563508e-06, + "loss": 0.0003, + "step": 2193 + }, + { + "epoch": 1.9461316781201508, + "grad_norm": 0.25981444120407104, + "learning_rate": 2.896435037041746e-06, + "loss": 0.0022, + "step": 2194 + }, + { + "epoch": 1.947018399467967, + "grad_norm": 0.2508253753185272, + "learning_rate": 2.892092763124499e-06, + "loss": 0.0014, + "step": 2195 + }, + { + "epoch": 1.9479051208157836, + "grad_norm": 0.2507166862487793, + "learning_rate": 2.8877524217928866e-06, + "loss": 0.0016, + "step": 2196 + }, + { + "epoch": 1.9487918421636001, + "grad_norm": 0.15352725982666016, + "learning_rate": 2.8834140170262446e-06, + "loss": 0.0013, + "step": 2197 + }, + { + "epoch": 1.9496785635114167, + "grad_norm": 0.021937541663646698, + "learning_rate": 2.8790775528021415e-06, + "loss": 0.0002, + "step": 2198 + }, + { + "epoch": 1.950565284859233, + "grad_norm": 0.12473490834236145, + "learning_rate": 2.874743033096361e-06, + "loss": 0.0006, + "step": 2199 + }, + { + "epoch": 1.9514520062070493, + "grad_norm": 0.3052102327346802, + "learning_rate": 2.8704104618829105e-06, + "loss": 0.0024, + "step": 2200 + }, + { + "epoch": 1.9523387275548658, + "grad_norm": 0.21318025887012482, + "learning_rate": 2.866079843134002e-06, + "loss": 0.0013, + "step": 2201 + }, + { + "epoch": 1.9532254489026823, + "grad_norm": 0.02442754991352558, + "learning_rate": 2.8617511808200645e-06, + "loss": 0.0001, + "step": 2202 + }, + { + "epoch": 1.9541121702504989, + "grad_norm": 0.30121952295303345, + "learning_rate": 2.857424478909731e-06, + "loss": 0.0004, + "step": 2203 + }, + { + "epoch": 1.9549988915983152, + "grad_norm": 0.1846991777420044, + "learning_rate": 2.85309974136984e-06, + "loss": 0.0011, + "step": 2204 + }, + { + "epoch": 1.9558856129461317, + "grad_norm": 0.5073151588439941, + "learning_rate": 2.8487769721654197e-06, + "loss": 0.0019, + "step": 2205 + }, + { + "epoch": 1.956772334293948, + "grad_norm": 0.014602754265069962, + "learning_rate": 2.8444561752597088e-06, + "loss": 0.0001, + "step": 2206 + }, + { + "epoch": 1.9576590556417646, + "grad_norm": 0.3761642873287201, + "learning_rate": 2.840137354614122e-06, + "loss": 0.0011, + "step": 2207 + }, + { + "epoch": 1.958545776989581, + "grad_norm": 0.044096872210502625, + "learning_rate": 2.8358205141882735e-06, + "loss": 0.0003, + "step": 2208 + }, + { + "epoch": 1.9594324983373976, + "grad_norm": 0.026059670373797417, + "learning_rate": 2.8315056579399563e-06, + "loss": 0.0002, + "step": 2209 + }, + { + "epoch": 1.960319219685214, + "grad_norm": 0.07342034578323364, + "learning_rate": 2.8271927898251484e-06, + "loss": 0.0002, + "step": 2210 + }, + { + "epoch": 1.9612059410330303, + "grad_norm": 0.34168630838394165, + "learning_rate": 2.822881913797998e-06, + "loss": 0.0018, + "step": 2211 + }, + { + "epoch": 1.9620926623808468, + "grad_norm": 0.15169137716293335, + "learning_rate": 2.8185730338108336e-06, + "loss": 0.0007, + "step": 2212 + }, + { + "epoch": 1.9629793837286633, + "grad_norm": 0.09074265509843826, + "learning_rate": 2.8142661538141514e-06, + "loss": 0.0004, + "step": 2213 + }, + { + "epoch": 1.9638661050764799, + "grad_norm": 0.03273439034819603, + "learning_rate": 2.809961277756614e-06, + "loss": 0.0001, + "step": 2214 + }, + { + "epoch": 1.9647528264242962, + "grad_norm": 0.048764728009700775, + "learning_rate": 2.8056584095850494e-06, + "loss": 0.0003, + "step": 2215 + }, + { + "epoch": 1.9656395477721125, + "grad_norm": 0.002200677525252104, + "learning_rate": 2.801357553244434e-06, + "loss": 0.0, + "step": 2216 + }, + { + "epoch": 1.966526269119929, + "grad_norm": 0.004921832587569952, + "learning_rate": 2.7970587126779173e-06, + "loss": 0.0, + "step": 2217 + }, + { + "epoch": 1.9674129904677455, + "grad_norm": 0.4720174968242645, + "learning_rate": 2.792761891826785e-06, + "loss": 0.0042, + "step": 2218 + }, + { + "epoch": 1.968299711815562, + "grad_norm": 0.008707297965884209, + "learning_rate": 2.788467094630479e-06, + "loss": 0.0, + "step": 2219 + }, + { + "epoch": 1.9691864331633784, + "grad_norm": 0.006691179238259792, + "learning_rate": 2.7841743250265844e-06, + "loss": 0.0, + "step": 2220 + }, + { + "epoch": 1.9700731545111947, + "grad_norm": 0.008885095827281475, + "learning_rate": 2.779883586950828e-06, + "loss": 0.0, + "step": 2221 + }, + { + "epoch": 1.9709598758590112, + "grad_norm": 0.11954095214605331, + "learning_rate": 2.7755948843370706e-06, + "loss": 0.0006, + "step": 2222 + }, + { + "epoch": 1.9718465972068278, + "grad_norm": 0.0582672655582428, + "learning_rate": 2.771308221117309e-06, + "loss": 0.0001, + "step": 2223 + }, + { + "epoch": 1.9727333185546443, + "grad_norm": 0.005173410754650831, + "learning_rate": 2.7670236012216722e-06, + "loss": 0.0, + "step": 2224 + }, + { + "epoch": 1.9736200399024606, + "grad_norm": 0.0285594891756773, + "learning_rate": 2.7627410285784164e-06, + "loss": 0.0001, + "step": 2225 + }, + { + "epoch": 1.9745067612502771, + "grad_norm": 0.0014770013513043523, + "learning_rate": 2.75846050711391e-06, + "loss": 0.0, + "step": 2226 + }, + { + "epoch": 1.9753934825980934, + "grad_norm": 0.299988329410553, + "learning_rate": 2.754182040752661e-06, + "loss": 0.0012, + "step": 2227 + }, + { + "epoch": 1.97628020394591, + "grad_norm": 0.20655956864356995, + "learning_rate": 2.749905633417273e-06, + "loss": 0.0015, + "step": 2228 + }, + { + "epoch": 1.9771669252937265, + "grad_norm": 0.07857910543680191, + "learning_rate": 2.7456312890284755e-06, + "loss": 0.0002, + "step": 2229 + }, + { + "epoch": 1.978053646641543, + "grad_norm": 0.029224885627627373, + "learning_rate": 2.741359011505095e-06, + "loss": 0.0001, + "step": 2230 + }, + { + "epoch": 1.9789403679893594, + "grad_norm": 0.22522462904453278, + "learning_rate": 2.737088804764077e-06, + "loss": 0.0007, + "step": 2231 + }, + { + "epoch": 1.9798270893371757, + "grad_norm": 0.01182663906365633, + "learning_rate": 2.7328206727204565e-06, + "loss": 0.0001, + "step": 2232 + }, + { + "epoch": 1.9807138106849922, + "grad_norm": 0.06838423013687134, + "learning_rate": 2.7285546192873723e-06, + "loss": 0.0003, + "step": 2233 + }, + { + "epoch": 1.9816005320328087, + "grad_norm": 0.6740751266479492, + "learning_rate": 2.7242906483760562e-06, + "loss": 0.0011, + "step": 2234 + }, + { + "epoch": 1.9824872533806253, + "grad_norm": 0.061367567628622055, + "learning_rate": 2.7200287638958323e-06, + "loss": 0.0002, + "step": 2235 + }, + { + "epoch": 1.9833739747284416, + "grad_norm": 0.29687654972076416, + "learning_rate": 2.7157689697541057e-06, + "loss": 0.0014, + "step": 2236 + }, + { + "epoch": 1.9842606960762579, + "grad_norm": 0.3020949363708496, + "learning_rate": 2.7115112698563727e-06, + "loss": 0.0029, + "step": 2237 + }, + { + "epoch": 1.9851474174240744, + "grad_norm": 0.021518951281905174, + "learning_rate": 2.7072556681062044e-06, + "loss": 0.0001, + "step": 2238 + }, + { + "epoch": 1.986034138771891, + "grad_norm": 0.15249262750148773, + "learning_rate": 2.703002168405252e-06, + "loss": 0.0004, + "step": 2239 + }, + { + "epoch": 1.9869208601197075, + "grad_norm": 0.0033653369173407555, + "learning_rate": 2.698750774653237e-06, + "loss": 0.0, + "step": 2240 + }, + { + "epoch": 1.9878075814675238, + "grad_norm": 0.21170556545257568, + "learning_rate": 2.6945014907479495e-06, + "loss": 0.0003, + "step": 2241 + }, + { + "epoch": 1.9886943028153403, + "grad_norm": 0.0033403278794139624, + "learning_rate": 2.6902543205852496e-06, + "loss": 0.0, + "step": 2242 + }, + { + "epoch": 1.9895810241631566, + "grad_norm": 0.052868060767650604, + "learning_rate": 2.686009268059052e-06, + "loss": 0.0001, + "step": 2243 + }, + { + "epoch": 1.9904677455109732, + "grad_norm": 0.057781487703323364, + "learning_rate": 2.6817663370613367e-06, + "loss": 0.0002, + "step": 2244 + }, + { + "epoch": 1.9913544668587897, + "grad_norm": 0.13221290707588196, + "learning_rate": 2.677525531482135e-06, + "loss": 0.0005, + "step": 2245 + }, + { + "epoch": 1.9922411882066062, + "grad_norm": 0.6088939309120178, + "learning_rate": 2.6732868552095337e-06, + "loss": 0.002, + "step": 2246 + }, + { + "epoch": 1.9931279095544225, + "grad_norm": 0.16569292545318604, + "learning_rate": 2.669050312129658e-06, + "loss": 0.0005, + "step": 2247 + }, + { + "epoch": 1.9940146309022388, + "grad_norm": 0.0092005031183362, + "learning_rate": 2.664815906126691e-06, + "loss": 0.0, + "step": 2248 + }, + { + "epoch": 1.9949013522500554, + "grad_norm": 0.015410387888550758, + "learning_rate": 2.6605836410828443e-06, + "loss": 0.0, + "step": 2249 + }, + { + "epoch": 1.995788073597872, + "grad_norm": 0.18164335191249847, + "learning_rate": 2.6563535208783753e-06, + "loss": 0.0007, + "step": 2250 + }, + { + "epoch": 1.9966747949456884, + "grad_norm": 0.3885621428489685, + "learning_rate": 2.652125549391565e-06, + "loss": 0.0045, + "step": 2251 + }, + { + "epoch": 1.9975615162935048, + "grad_norm": 0.009692193940281868, + "learning_rate": 2.6478997304987383e-06, + "loss": 0.0, + "step": 2252 + }, + { + "epoch": 1.998448237641321, + "grad_norm": 0.008395759388804436, + "learning_rate": 2.6436760680742325e-06, + "loss": 0.0, + "step": 2253 + }, + { + "epoch": 1.9993349589891376, + "grad_norm": 0.5207878351211548, + "learning_rate": 2.639454565990417e-06, + "loss": 0.0015, + "step": 2254 + }, + { + "epoch": 2.0008867213478165, + "grad_norm": 0.31940406560897827, + "learning_rate": 2.6352352281176763e-06, + "loss": 0.0038, + "step": 2255 + }, + { + "epoch": 2.001773442695633, + "grad_norm": 0.0426461435854435, + "learning_rate": 2.6310180583244157e-06, + "loss": 0.0002, + "step": 2256 + }, + { + "epoch": 2.001773442695633, + "eval_loss": 0.03277111053466797, + "eval_runtime": 63.8702, + "eval_samples_per_second": 3.006, + "eval_steps_per_second": 0.752, + "step": 2256 + }, + { + "epoch": 2.002660164043449, + "grad_norm": 0.3251824378967285, + "learning_rate": 2.6268030604770435e-06, + "loss": 0.0043, + "step": 2257 + }, + { + "epoch": 2.0035468853912657, + "grad_norm": 0.010555939748883247, + "learning_rate": 2.622590238439985e-06, + "loss": 0.0001, + "step": 2258 + }, + { + "epoch": 2.004433606739082, + "grad_norm": 0.003281393786892295, + "learning_rate": 2.618379596075668e-06, + "loss": 0.0, + "step": 2259 + }, + { + "epoch": 2.0053203280868988, + "grad_norm": 0.23340563476085663, + "learning_rate": 2.6141711372445216e-06, + "loss": 0.0008, + "step": 2260 + }, + { + "epoch": 2.0062070494347153, + "grad_norm": 0.07923705130815506, + "learning_rate": 2.609964865804974e-06, + "loss": 0.0001, + "step": 2261 + }, + { + "epoch": 2.0070937707825314, + "grad_norm": 0.531326174736023, + "learning_rate": 2.605760785613447e-06, + "loss": 0.0067, + "step": 2262 + }, + { + "epoch": 2.007980492130348, + "grad_norm": 0.07579508423805237, + "learning_rate": 2.6015589005243517e-06, + "loss": 0.0002, + "step": 2263 + }, + { + "epoch": 2.0088672134781644, + "grad_norm": 0.009762165136635303, + "learning_rate": 2.59735921439009e-06, + "loss": 0.0001, + "step": 2264 + }, + { + "epoch": 2.009753934825981, + "grad_norm": 0.03709012642502785, + "learning_rate": 2.593161731061046e-06, + "loss": 0.0002, + "step": 2265 + }, + { + "epoch": 2.0106406561737975, + "grad_norm": 0.0150918485596776, + "learning_rate": 2.588966454385585e-06, + "loss": 0.0001, + "step": 2266 + }, + { + "epoch": 2.011527377521614, + "grad_norm": 0.006500888615846634, + "learning_rate": 2.584773388210049e-06, + "loss": 0.0, + "step": 2267 + }, + { + "epoch": 2.01241409886943, + "grad_norm": 0.03855859860777855, + "learning_rate": 2.5805825363787496e-06, + "loss": 0.0001, + "step": 2268 + }, + { + "epoch": 2.0133008202172467, + "grad_norm": 0.0074251629412174225, + "learning_rate": 2.5763939027339775e-06, + "loss": 0.0, + "step": 2269 + }, + { + "epoch": 2.014187541565063, + "grad_norm": 0.022057898342609406, + "learning_rate": 2.572207491115979e-06, + "loss": 0.0001, + "step": 2270 + }, + { + "epoch": 2.0150742629128797, + "grad_norm": 0.08520907163619995, + "learning_rate": 2.568023305362971e-06, + "loss": 0.0003, + "step": 2271 + }, + { + "epoch": 2.0159609842606963, + "grad_norm": 0.0048646205104887486, + "learning_rate": 2.5638413493111227e-06, + "loss": 0.0, + "step": 2272 + }, + { + "epoch": 2.0168477056085123, + "grad_norm": 0.031922418624162674, + "learning_rate": 2.5596616267945683e-06, + "loss": 0.0002, + "step": 2273 + }, + { + "epoch": 2.017734426956329, + "grad_norm": 0.043785277754068375, + "learning_rate": 2.5554841416453856e-06, + "loss": 0.0001, + "step": 2274 + }, + { + "epoch": 2.0186211483041454, + "grad_norm": 0.00607112143188715, + "learning_rate": 2.551308897693607e-06, + "loss": 0.0, + "step": 2275 + }, + { + "epoch": 2.019507869651962, + "grad_norm": 0.0038311337120831013, + "learning_rate": 2.547135898767202e-06, + "loss": 0.0, + "step": 2276 + }, + { + "epoch": 2.0203945909997785, + "grad_norm": 0.023988040164113045, + "learning_rate": 2.542965148692095e-06, + "loss": 0.0001, + "step": 2277 + }, + { + "epoch": 2.0212813123475946, + "grad_norm": 0.37009167671203613, + "learning_rate": 2.5387966512921357e-06, + "loss": 0.0024, + "step": 2278 + }, + { + "epoch": 2.022168033695411, + "grad_norm": 0.0046903216280043125, + "learning_rate": 2.534630410389116e-06, + "loss": 0.0, + "step": 2279 + }, + { + "epoch": 2.0230547550432276, + "grad_norm": 0.032938845455646515, + "learning_rate": 2.530466429802756e-06, + "loss": 0.0001, + "step": 2280 + }, + { + "epoch": 2.023941476391044, + "grad_norm": 0.04838014394044876, + "learning_rate": 2.5263047133507067e-06, + "loss": 0.0002, + "step": 2281 + }, + { + "epoch": 2.0248281977388607, + "grad_norm": 0.018787918612360954, + "learning_rate": 2.5221452648485367e-06, + "loss": 0.0001, + "step": 2282 + }, + { + "epoch": 2.0257149190866772, + "grad_norm": 0.009505733847618103, + "learning_rate": 2.5179880881097406e-06, + "loss": 0.0001, + "step": 2283 + }, + { + "epoch": 2.0266016404344933, + "grad_norm": 0.010092353448271751, + "learning_rate": 2.513833186945731e-06, + "loss": 0.0, + "step": 2284 + }, + { + "epoch": 2.02748836178231, + "grad_norm": 0.521740198135376, + "learning_rate": 2.509680565165831e-06, + "loss": 0.0035, + "step": 2285 + }, + { + "epoch": 2.0283750831301264, + "grad_norm": 0.0034733442589640617, + "learning_rate": 2.505530226577275e-06, + "loss": 0.0, + "step": 2286 + }, + { + "epoch": 2.029261804477943, + "grad_norm": 0.043323416262865067, + "learning_rate": 2.5013821749852063e-06, + "loss": 0.0002, + "step": 2287 + }, + { + "epoch": 2.0301485258257594, + "grad_norm": 0.8648737668991089, + "learning_rate": 2.4972364141926663e-06, + "loss": 0.0036, + "step": 2288 + }, + { + "epoch": 2.0310352471735755, + "grad_norm": 0.24954581260681152, + "learning_rate": 2.4930929480006e-06, + "loss": 0.0037, + "step": 2289 + }, + { + "epoch": 2.031921968521392, + "grad_norm": 0.26250550150871277, + "learning_rate": 2.4889517802078477e-06, + "loss": 0.001, + "step": 2290 + }, + { + "epoch": 2.0328086898692086, + "grad_norm": 0.03862825408577919, + "learning_rate": 2.484812914611144e-06, + "loss": 0.0002, + "step": 2291 + }, + { + "epoch": 2.033695411217025, + "grad_norm": 0.04618274047970772, + "learning_rate": 2.4806763550051115e-06, + "loss": 0.0002, + "step": 2292 + }, + { + "epoch": 2.0345821325648417, + "grad_norm": 0.0045454599894583225, + "learning_rate": 2.476542105182254e-06, + "loss": 0.0, + "step": 2293 + }, + { + "epoch": 2.0354688539126577, + "grad_norm": 0.060077376663684845, + "learning_rate": 2.4724101689329694e-06, + "loss": 0.0002, + "step": 2294 + }, + { + "epoch": 2.0363555752604743, + "grad_norm": 0.016459325328469276, + "learning_rate": 2.4682805500455214e-06, + "loss": 0.0001, + "step": 2295 + }, + { + "epoch": 2.037242296608291, + "grad_norm": 0.03847726806998253, + "learning_rate": 2.464153252306059e-06, + "loss": 0.0001, + "step": 2296 + }, + { + "epoch": 2.0381290179561073, + "grad_norm": 0.14360575377941132, + "learning_rate": 2.4600282794985923e-06, + "loss": 0.0006, + "step": 2297 + }, + { + "epoch": 2.039015739303924, + "grad_norm": 0.07891727983951569, + "learning_rate": 2.4559056354050154e-06, + "loss": 0.0003, + "step": 2298 + }, + { + "epoch": 2.0399024606517404, + "grad_norm": 0.01991586573421955, + "learning_rate": 2.451785323805072e-06, + "loss": 0.0001, + "step": 2299 + }, + { + "epoch": 2.0407891819995565, + "grad_norm": 0.32266005873680115, + "learning_rate": 2.4476673484763756e-06, + "loss": 0.0011, + "step": 2300 + }, + { + "epoch": 2.041675903347373, + "grad_norm": 0.0066023049876093864, + "learning_rate": 2.4435517131943965e-06, + "loss": 0.0, + "step": 2301 + }, + { + "epoch": 2.0425626246951896, + "grad_norm": 0.9420242309570312, + "learning_rate": 2.43943842173246e-06, + "loss": 0.0041, + "step": 2302 + }, + { + "epoch": 2.043449346043006, + "grad_norm": 0.09942077100276947, + "learning_rate": 2.435327477861739e-06, + "loss": 0.0006, + "step": 2303 + }, + { + "epoch": 2.0443360673908226, + "grad_norm": 0.12234442681074142, + "learning_rate": 2.431218885351257e-06, + "loss": 0.0005, + "step": 2304 + }, + { + "epoch": 2.0452227887386387, + "grad_norm": 0.19722875952720642, + "learning_rate": 2.4271126479678824e-06, + "loss": 0.0007, + "step": 2305 + }, + { + "epoch": 2.0461095100864553, + "grad_norm": 0.1235419362783432, + "learning_rate": 2.423008769476325e-06, + "loss": 0.0003, + "step": 2306 + }, + { + "epoch": 2.046996231434272, + "grad_norm": 0.012925482355058193, + "learning_rate": 2.418907253639124e-06, + "loss": 0.0001, + "step": 2307 + }, + { + "epoch": 2.0478829527820883, + "grad_norm": 0.012632215395569801, + "learning_rate": 2.4148081042166667e-06, + "loss": 0.0001, + "step": 2308 + }, + { + "epoch": 2.048769674129905, + "grad_norm": 0.003258196171373129, + "learning_rate": 2.4107113249671567e-06, + "loss": 0.0, + "step": 2309 + }, + { + "epoch": 2.049656395477721, + "grad_norm": 0.05988994985818863, + "learning_rate": 2.4066169196466326e-06, + "loss": 0.0003, + "step": 2310 + }, + { + "epoch": 2.0505431168255375, + "grad_norm": 0.003724791342392564, + "learning_rate": 2.4025248920089544e-06, + "loss": 0.0, + "step": 2311 + }, + { + "epoch": 2.051429838173354, + "grad_norm": 0.006751666311174631, + "learning_rate": 2.3984352458058045e-06, + "loss": 0.0, + "step": 2312 + }, + { + "epoch": 2.0523165595211705, + "grad_norm": 0.002435932168737054, + "learning_rate": 2.3943479847866764e-06, + "loss": 0.0, + "step": 2313 + }, + { + "epoch": 2.053203280868987, + "grad_norm": 0.02041725628077984, + "learning_rate": 2.3902631126988808e-06, + "loss": 0.0001, + "step": 2314 + }, + { + "epoch": 2.054090002216803, + "grad_norm": 0.004865054506808519, + "learning_rate": 2.3861806332875377e-06, + "loss": 0.0, + "step": 2315 + }, + { + "epoch": 2.0549767235646197, + "grad_norm": 0.060207922011613846, + "learning_rate": 2.382100550295574e-06, + "loss": 0.0002, + "step": 2316 + }, + { + "epoch": 2.055863444912436, + "grad_norm": 0.010751526802778244, + "learning_rate": 2.37802286746372e-06, + "loss": 0.0, + "step": 2317 + }, + { + "epoch": 2.0567501662602528, + "grad_norm": 0.014327171258628368, + "learning_rate": 2.373947588530499e-06, + "loss": 0.0001, + "step": 2318 + }, + { + "epoch": 2.0576368876080693, + "grad_norm": 0.006228189449757338, + "learning_rate": 2.369874717232242e-06, + "loss": 0.0, + "step": 2319 + }, + { + "epoch": 2.058523608955886, + "grad_norm": 0.2894449830055237, + "learning_rate": 2.365804257303062e-06, + "loss": 0.0045, + "step": 2320 + }, + { + "epoch": 2.059410330303702, + "grad_norm": 0.014009971171617508, + "learning_rate": 2.3617362124748664e-06, + "loss": 0.0001, + "step": 2321 + }, + { + "epoch": 2.0602970516515184, + "grad_norm": 0.014323338866233826, + "learning_rate": 2.3576705864773477e-06, + "loss": 0.0001, + "step": 2322 + }, + { + "epoch": 2.061183772999335, + "grad_norm": 0.0071457927115261555, + "learning_rate": 2.3536073830379814e-06, + "loss": 0.0, + "step": 2323 + }, + { + "epoch": 2.0620704943471515, + "grad_norm": 0.013069824315607548, + "learning_rate": 2.349546605882018e-06, + "loss": 0.0001, + "step": 2324 + }, + { + "epoch": 2.062957215694968, + "grad_norm": 0.07725962996482849, + "learning_rate": 2.345488258732488e-06, + "loss": 0.0004, + "step": 2325 + }, + { + "epoch": 2.063843937042784, + "grad_norm": 0.1443031281232834, + "learning_rate": 2.3414323453101924e-06, + "loss": 0.0003, + "step": 2326 + }, + { + "epoch": 2.0647306583906007, + "grad_norm": 0.009164338000118732, + "learning_rate": 2.3373788693337024e-06, + "loss": 0.0001, + "step": 2327 + }, + { + "epoch": 2.065617379738417, + "grad_norm": 0.016295766457915306, + "learning_rate": 2.333327834519348e-06, + "loss": 0.0001, + "step": 2328 + }, + { + "epoch": 2.0665041010862337, + "grad_norm": 0.10227362811565399, + "learning_rate": 2.3292792445812327e-06, + "loss": 0.0002, + "step": 2329 + }, + { + "epoch": 2.0673908224340503, + "grad_norm": 0.01212971843779087, + "learning_rate": 2.3252331032312075e-06, + "loss": 0.0, + "step": 2330 + }, + { + "epoch": 2.0682775437818663, + "grad_norm": 0.4432136118412018, + "learning_rate": 2.3211894141788856e-06, + "loss": 0.0078, + "step": 2331 + }, + { + "epoch": 2.069164265129683, + "grad_norm": 0.04203218221664429, + "learning_rate": 2.317148181131624e-06, + "loss": 0.0002, + "step": 2332 + }, + { + "epoch": 2.0700509864774994, + "grad_norm": 0.07127822935581207, + "learning_rate": 2.31310940779454e-06, + "loss": 0.0003, + "step": 2333 + }, + { + "epoch": 2.070937707825316, + "grad_norm": 0.3511061370372772, + "learning_rate": 2.3090730978704832e-06, + "loss": 0.0007, + "step": 2334 + }, + { + "epoch": 2.0718244291731325, + "grad_norm": 0.4882556200027466, + "learning_rate": 2.305039255060053e-06, + "loss": 0.0008, + "step": 2335 + }, + { + "epoch": 2.0727111505209486, + "grad_norm": 0.12589851021766663, + "learning_rate": 2.301007883061584e-06, + "loss": 0.0004, + "step": 2336 + }, + { + "epoch": 2.073597871868765, + "grad_norm": 0.02481425739824772, + "learning_rate": 2.296978985571145e-06, + "loss": 0.0002, + "step": 2337 + }, + { + "epoch": 2.0744845932165816, + "grad_norm": 0.012816986069083214, + "learning_rate": 2.2929525662825384e-06, + "loss": 0.0001, + "step": 2338 + }, + { + "epoch": 2.075371314564398, + "grad_norm": 0.02135675586760044, + "learning_rate": 2.28892862888729e-06, + "loss": 0.0001, + "step": 2339 + }, + { + "epoch": 2.0762580359122147, + "grad_norm": 0.00505865877494216, + "learning_rate": 2.284907177074655e-06, + "loss": 0.0, + "step": 2340 + }, + { + "epoch": 2.0771447572600312, + "grad_norm": 0.006438021082431078, + "learning_rate": 2.2808882145316075e-06, + "loss": 0.0, + "step": 2341 + }, + { + "epoch": 2.0780314786078473, + "grad_norm": 0.020976608619093895, + "learning_rate": 2.276871744942839e-06, + "loss": 0.0001, + "step": 2342 + }, + { + "epoch": 2.078918199955664, + "grad_norm": 0.06125025451183319, + "learning_rate": 2.2728577719907576e-06, + "loss": 0.0003, + "step": 2343 + }, + { + "epoch": 2.0798049213034804, + "grad_norm": 0.029785003513097763, + "learning_rate": 2.268846299355481e-06, + "loss": 0.0001, + "step": 2344 + }, + { + "epoch": 2.080691642651297, + "grad_norm": 0.002666131593286991, + "learning_rate": 2.2648373307148314e-06, + "loss": 0.0, + "step": 2345 + }, + { + "epoch": 2.0815783639991134, + "grad_norm": 0.31236812472343445, + "learning_rate": 2.2608308697443405e-06, + "loss": 0.0016, + "step": 2346 + }, + { + "epoch": 2.0824650853469295, + "grad_norm": 0.007688076235353947, + "learning_rate": 2.2568269201172382e-06, + "loss": 0.0001, + "step": 2347 + }, + { + "epoch": 2.083351806694746, + "grad_norm": 0.06622789800167084, + "learning_rate": 2.2528254855044546e-06, + "loss": 0.0002, + "step": 2348 + }, + { + "epoch": 2.0842385280425626, + "grad_norm": 0.18980400264263153, + "learning_rate": 2.248826569574606e-06, + "loss": 0.0019, + "step": 2349 + }, + { + "epoch": 2.085125249390379, + "grad_norm": 0.025342190638184547, + "learning_rate": 2.2448301759940126e-06, + "loss": 0.0001, + "step": 2350 + }, + { + "epoch": 2.0860119707381957, + "grad_norm": 0.025142235681414604, + "learning_rate": 2.2408363084266693e-06, + "loss": 0.0001, + "step": 2351 + }, + { + "epoch": 2.086898692086012, + "grad_norm": 0.3066583573818207, + "learning_rate": 2.236844970534265e-06, + "loss": 0.001, + "step": 2352 + }, + { + "epoch": 2.0877854134338283, + "grad_norm": 0.2630266547203064, + "learning_rate": 2.2328561659761576e-06, + "loss": 0.001, + "step": 2353 + }, + { + "epoch": 2.088672134781645, + "grad_norm": 0.006459026597440243, + "learning_rate": 2.228869898409399e-06, + "loss": 0.0, + "step": 2354 + }, + { + "epoch": 2.0895588561294614, + "grad_norm": 0.02460658550262451, + "learning_rate": 2.224886171488698e-06, + "loss": 0.0001, + "step": 2355 + }, + { + "epoch": 2.090445577477278, + "grad_norm": 0.00549986120313406, + "learning_rate": 2.2209049888664453e-06, + "loss": 0.0, + "step": 2356 + }, + { + "epoch": 2.0913322988250944, + "grad_norm": 0.035316918045282364, + "learning_rate": 2.216926354192695e-06, + "loss": 0.0001, + "step": 2357 + }, + { + "epoch": 2.0922190201729105, + "grad_norm": 0.026010386645793915, + "learning_rate": 2.212950271115167e-06, + "loss": 0.0001, + "step": 2358 + }, + { + "epoch": 2.093105741520727, + "grad_norm": 0.08127646893262863, + "learning_rate": 2.2089767432792375e-06, + "loss": 0.0001, + "step": 2359 + }, + { + "epoch": 2.0939924628685436, + "grad_norm": 0.11064977198839188, + "learning_rate": 2.205005774327944e-06, + "loss": 0.0003, + "step": 2360 + }, + { + "epoch": 2.09487918421636, + "grad_norm": 0.24248307943344116, + "learning_rate": 2.2010373679019773e-06, + "loss": 0.0005, + "step": 2361 + }, + { + "epoch": 2.0957659055641766, + "grad_norm": 0.037203628569841385, + "learning_rate": 2.1970715276396783e-06, + "loss": 0.0002, + "step": 2362 + }, + { + "epoch": 2.0966526269119927, + "grad_norm": 0.007871000096201897, + "learning_rate": 2.193108257177035e-06, + "loss": 0.0001, + "step": 2363 + }, + { + "epoch": 2.0975393482598093, + "grad_norm": 0.017412377521395683, + "learning_rate": 2.189147560147682e-06, + "loss": 0.0001, + "step": 2364 + }, + { + "epoch": 2.098426069607626, + "grad_norm": 0.008841924369335175, + "learning_rate": 2.185189440182887e-06, + "loss": 0.0, + "step": 2365 + }, + { + "epoch": 2.0993127909554423, + "grad_norm": 0.0007497490732930601, + "learning_rate": 2.181233900911564e-06, + "loss": 0.0, + "step": 2366 + }, + { + "epoch": 2.100199512303259, + "grad_norm": 0.010741816833615303, + "learning_rate": 2.177280945960255e-06, + "loss": 0.0, + "step": 2367 + }, + { + "epoch": 2.101086233651075, + "grad_norm": 0.010129451751708984, + "learning_rate": 2.1733305789531363e-06, + "loss": 0.0, + "step": 2368 + }, + { + "epoch": 2.1019729549988915, + "grad_norm": 0.03930528834462166, + "learning_rate": 2.16938280351201e-06, + "loss": 0.0001, + "step": 2369 + }, + { + "epoch": 2.102859676346708, + "grad_norm": 0.004623851738870144, + "learning_rate": 2.1654376232562985e-06, + "loss": 0.0, + "step": 2370 + }, + { + "epoch": 2.1037463976945245, + "grad_norm": 0.017552360892295837, + "learning_rate": 2.161495041803054e-06, + "loss": 0.0001, + "step": 2371 + }, + { + "epoch": 2.104633119042341, + "grad_norm": 0.23550549149513245, + "learning_rate": 2.1575550627669355e-06, + "loss": 0.0016, + "step": 2372 + }, + { + "epoch": 2.1055198403901576, + "grad_norm": 0.006746775936335325, + "learning_rate": 2.1536176897602245e-06, + "loss": 0.0, + "step": 2373 + }, + { + "epoch": 2.1064065617379737, + "grad_norm": 0.06297089904546738, + "learning_rate": 2.149682926392805e-06, + "loss": 0.0003, + "step": 2374 + }, + { + "epoch": 2.1072932830857902, + "grad_norm": 0.037527166306972504, + "learning_rate": 2.1457507762721792e-06, + "loss": 0.0001, + "step": 2375 + }, + { + "epoch": 2.1081800044336068, + "grad_norm": 0.17847929894924164, + "learning_rate": 2.141821243003443e-06, + "loss": 0.0003, + "step": 2376 + }, + { + "epoch": 2.1090667257814233, + "grad_norm": 0.04266642406582832, + "learning_rate": 2.1378943301893003e-06, + "loss": 0.0001, + "step": 2377 + }, + { + "epoch": 2.10995344712924, + "grad_norm": 0.22053800523281097, + "learning_rate": 2.133970041430044e-06, + "loss": 0.001, + "step": 2378 + }, + { + "epoch": 2.110840168477056, + "grad_norm": 0.0320192314684391, + "learning_rate": 2.130048380323575e-06, + "loss": 0.0001, + "step": 2379 + }, + { + "epoch": 2.1117268898248724, + "grad_norm": 0.0019540840294212103, + "learning_rate": 2.1261293504653707e-06, + "loss": 0.0, + "step": 2380 + }, + { + "epoch": 2.112613611172689, + "grad_norm": 0.0041855634190142155, + "learning_rate": 2.122212955448504e-06, + "loss": 0.0, + "step": 2381 + }, + { + "epoch": 2.1135003325205055, + "grad_norm": 0.028047163039445877, + "learning_rate": 2.118299198863631e-06, + "loss": 0.0001, + "step": 2382 + }, + { + "epoch": 2.114387053868322, + "grad_norm": 0.02840139903128147, + "learning_rate": 2.1143880842989895e-06, + "loss": 0.0001, + "step": 2383 + }, + { + "epoch": 2.115273775216138, + "grad_norm": 0.22898061573505402, + "learning_rate": 2.1104796153403906e-06, + "loss": 0.0012, + "step": 2384 + }, + { + "epoch": 2.1161604965639547, + "grad_norm": 0.0021610562689602375, + "learning_rate": 2.1065737955712244e-06, + "loss": 0.0, + "step": 2385 + }, + { + "epoch": 2.117047217911771, + "grad_norm": 0.003969789016991854, + "learning_rate": 2.102670628572451e-06, + "loss": 0.0, + "step": 2386 + }, + { + "epoch": 2.1179339392595877, + "grad_norm": 0.0022155435290187597, + "learning_rate": 2.0987701179225985e-06, + "loss": 0.0, + "step": 2387 + }, + { + "epoch": 2.1188206606074043, + "grad_norm": 0.008832978084683418, + "learning_rate": 2.0948722671977584e-06, + "loss": 0.0, + "step": 2388 + }, + { + "epoch": 2.1197073819552203, + "grad_norm": 0.027194837108254433, + "learning_rate": 2.0909770799715865e-06, + "loss": 0.0001, + "step": 2389 + }, + { + "epoch": 2.120594103303037, + "grad_norm": 0.0010967071866616607, + "learning_rate": 2.0870845598152907e-06, + "loss": 0.0, + "step": 2390 + }, + { + "epoch": 2.1214808246508534, + "grad_norm": 0.0053475042805075645, + "learning_rate": 2.0831947102976392e-06, + "loss": 0.0, + "step": 2391 + }, + { + "epoch": 2.12236754599867, + "grad_norm": 0.008405130356550217, + "learning_rate": 2.07930753498495e-06, + "loss": 0.0, + "step": 2392 + }, + { + "epoch": 2.1232542673464865, + "grad_norm": 0.005467743147164583, + "learning_rate": 2.0754230374410895e-06, + "loss": 0.0, + "step": 2393 + }, + { + "epoch": 2.124140988694303, + "grad_norm": 0.004836921580135822, + "learning_rate": 2.0715412212274698e-06, + "loss": 0.0, + "step": 2394 + }, + { + "epoch": 2.125027710042119, + "grad_norm": 0.001921136979945004, + "learning_rate": 2.0676620899030393e-06, + "loss": 0.0, + "step": 2395 + }, + { + "epoch": 2.1259144313899356, + "grad_norm": 0.054766010493040085, + "learning_rate": 2.063785647024295e-06, + "loss": 0.0002, + "step": 2396 + }, + { + "epoch": 2.126801152737752, + "grad_norm": 0.02580122835934162, + "learning_rate": 2.0599118961452593e-06, + "loss": 0.0001, + "step": 2397 + }, + { + "epoch": 2.1276878740855687, + "grad_norm": 0.01703774183988571, + "learning_rate": 2.0560408408174925e-06, + "loss": 0.0001, + "step": 2398 + }, + { + "epoch": 2.1285745954333852, + "grad_norm": 0.016674144193530083, + "learning_rate": 2.052172484590077e-06, + "loss": 0.0001, + "step": 2399 + }, + { + "epoch": 2.1294613167812013, + "grad_norm": 0.004239710979163647, + "learning_rate": 2.0483068310096327e-06, + "loss": 0.0, + "step": 2400 + }, + { + "epoch": 2.130348038129018, + "grad_norm": 0.019841166213154793, + "learning_rate": 2.0444438836202872e-06, + "loss": 0.0001, + "step": 2401 + }, + { + "epoch": 2.1312347594768344, + "grad_norm": 0.004029495641589165, + "learning_rate": 2.040583645963697e-06, + "loss": 0.0, + "step": 2402 + }, + { + "epoch": 2.132121480824651, + "grad_norm": 0.005214055068790913, + "learning_rate": 2.0367261215790306e-06, + "loss": 0.0, + "step": 2403 + }, + { + "epoch": 2.1330082021724674, + "grad_norm": 0.02571919932961464, + "learning_rate": 2.032871314002971e-06, + "loss": 0.0001, + "step": 2404 + }, + { + "epoch": 2.133894923520284, + "grad_norm": 0.01948658563196659, + "learning_rate": 2.0290192267697052e-06, + "loss": 0.0, + "step": 2405 + }, + { + "epoch": 2.1347816448681, + "grad_norm": 0.6353626251220703, + "learning_rate": 2.0251698634109314e-06, + "loss": 0.0012, + "step": 2406 + }, + { + "epoch": 2.1356683662159166, + "grad_norm": 0.024343444034457207, + "learning_rate": 2.0213232274558483e-06, + "loss": 0.0001, + "step": 2407 + }, + { + "epoch": 2.136555087563733, + "grad_norm": 0.023843051865696907, + "learning_rate": 2.0174793224311566e-06, + "loss": 0.0001, + "step": 2408 + }, + { + "epoch": 2.1374418089115497, + "grad_norm": 0.025098059326410294, + "learning_rate": 2.0136381518610455e-06, + "loss": 0.0, + "step": 2409 + }, + { + "epoch": 2.138328530259366, + "grad_norm": 0.00488665234297514, + "learning_rate": 2.0097997192672103e-06, + "loss": 0.0, + "step": 2410 + }, + { + "epoch": 2.1392152516071823, + "grad_norm": 0.003173389006406069, + "learning_rate": 2.005964028168823e-06, + "loss": 0.0, + "step": 2411 + }, + { + "epoch": 2.140101972954999, + "grad_norm": 0.0017301311017945409, + "learning_rate": 2.002131082082549e-06, + "loss": 0.0, + "step": 2412 + }, + { + "epoch": 2.1409886943028154, + "grad_norm": 0.4865742623806, + "learning_rate": 1.998300884522536e-06, + "loss": 0.001, + "step": 2413 + }, + { + "epoch": 2.141875415650632, + "grad_norm": 0.011351162567734718, + "learning_rate": 1.994473439000413e-06, + "loss": 0.0001, + "step": 2414 + }, + { + "epoch": 2.1427621369984484, + "grad_norm": 0.003956715576350689, + "learning_rate": 1.9906487490252807e-06, + "loss": 0.0, + "step": 2415 + }, + { + "epoch": 2.1436488583462645, + "grad_norm": 0.01261945441365242, + "learning_rate": 1.9868268181037186e-06, + "loss": 0.0, + "step": 2416 + }, + { + "epoch": 2.144535579694081, + "grad_norm": 0.0019356987904757261, + "learning_rate": 1.9830076497397753e-06, + "loss": 0.0, + "step": 2417 + }, + { + "epoch": 2.1454223010418976, + "grad_norm": 0.0014316060114651918, + "learning_rate": 1.979191247434966e-06, + "loss": 0.0, + "step": 2418 + }, + { + "epoch": 2.146309022389714, + "grad_norm": 0.0029126235749572515, + "learning_rate": 1.9753776146882724e-06, + "loss": 0.0, + "step": 2419 + }, + { + "epoch": 2.1471957437375306, + "grad_norm": 0.049275338649749756, + "learning_rate": 1.97156675499613e-06, + "loss": 0.0001, + "step": 2420 + }, + { + "epoch": 2.1480824650853467, + "grad_norm": 0.0023662822786718607, + "learning_rate": 1.9677586718524422e-06, + "loss": 0.0, + "step": 2421 + }, + { + "epoch": 2.1489691864331633, + "grad_norm": 0.001958158565685153, + "learning_rate": 1.963953368748557e-06, + "loss": 0.0, + "step": 2422 + }, + { + "epoch": 2.14985590778098, + "grad_norm": 0.07051583379507065, + "learning_rate": 1.960150849173279e-06, + "loss": 0.0002, + "step": 2423 + }, + { + "epoch": 2.1507426291287963, + "grad_norm": 0.005282245110720396, + "learning_rate": 1.9563511166128607e-06, + "loss": 0.0, + "step": 2424 + }, + { + "epoch": 2.151629350476613, + "grad_norm": 0.030105119571089745, + "learning_rate": 1.9525541745509997e-06, + "loss": 0.0001, + "step": 2425 + }, + { + "epoch": 2.1525160718244294, + "grad_norm": 0.001675124978646636, + "learning_rate": 1.94876002646883e-06, + "loss": 0.0, + "step": 2426 + }, + { + "epoch": 2.1534027931722455, + "grad_norm": 0.004662784747779369, + "learning_rate": 1.9449686758449298e-06, + "loss": 0.0, + "step": 2427 + }, + { + "epoch": 2.154289514520062, + "grad_norm": 0.033499255776405334, + "learning_rate": 1.941180126155311e-06, + "loss": 0.0001, + "step": 2428 + }, + { + "epoch": 2.1551762358678785, + "grad_norm": 0.009505156427621841, + "learning_rate": 1.937394380873418e-06, + "loss": 0.0, + "step": 2429 + }, + { + "epoch": 2.156062957215695, + "grad_norm": 0.012780265882611275, + "learning_rate": 1.9336114434701186e-06, + "loss": 0.0001, + "step": 2430 + }, + { + "epoch": 2.1569496785635116, + "grad_norm": 0.015024818480014801, + "learning_rate": 1.9298313174137177e-06, + "loss": 0.0001, + "step": 2431 + }, + { + "epoch": 2.1578363999113277, + "grad_norm": 0.2287774533033371, + "learning_rate": 1.92605400616993e-06, + "loss": 0.0004, + "step": 2432 + }, + { + "epoch": 2.1587231212591442, + "grad_norm": 0.0413081981241703, + "learning_rate": 1.9222795132018995e-06, + "loss": 0.0002, + "step": 2433 + }, + { + "epoch": 2.1596098426069608, + "grad_norm": 0.0011024032719433308, + "learning_rate": 1.918507841970177e-06, + "loss": 0.0, + "step": 2434 + }, + { + "epoch": 2.1604965639547773, + "grad_norm": 0.04966243356466293, + "learning_rate": 1.914738995932737e-06, + "loss": 0.0002, + "step": 2435 + }, + { + "epoch": 2.161383285302594, + "grad_norm": 0.29149574041366577, + "learning_rate": 1.9109729785449543e-06, + "loss": 0.0024, + "step": 2436 + }, + { + "epoch": 2.16227000665041, + "grad_norm": 0.004733963403850794, + "learning_rate": 1.9072097932596156e-06, + "loss": 0.0, + "step": 2437 + }, + { + "epoch": 2.1631567279982264, + "grad_norm": 0.07190831005573273, + "learning_rate": 1.9034494435269097e-06, + "loss": 0.0003, + "step": 2438 + }, + { + "epoch": 2.164043449346043, + "grad_norm": 0.0017673807451501489, + "learning_rate": 1.8996919327944252e-06, + "loss": 0.0, + "step": 2439 + }, + { + "epoch": 2.1649301706938595, + "grad_norm": 0.3513320982456207, + "learning_rate": 1.8959372645071506e-06, + "loss": 0.0016, + "step": 2440 + }, + { + "epoch": 2.165816892041676, + "grad_norm": 0.44481173157691956, + "learning_rate": 1.892185442107462e-06, + "loss": 0.0066, + "step": 2441 + }, + { + "epoch": 2.166703613389492, + "grad_norm": 0.015360888093709946, + "learning_rate": 1.888436469035132e-06, + "loss": 0.0, + "step": 2442 + }, + { + "epoch": 2.1675903347373087, + "grad_norm": 0.4169534146785736, + "learning_rate": 1.88469034872732e-06, + "loss": 0.0014, + "step": 2443 + }, + { + "epoch": 2.168477056085125, + "grad_norm": 0.012291036546230316, + "learning_rate": 1.8809470846185684e-06, + "loss": 0.0, + "step": 2444 + }, + { + "epoch": 2.1693637774329417, + "grad_norm": 0.19249936938285828, + "learning_rate": 1.8772066801408017e-06, + "loss": 0.0005, + "step": 2445 + }, + { + "epoch": 2.1702504987807583, + "grad_norm": 0.00133326998911798, + "learning_rate": 1.873469138723325e-06, + "loss": 0.0, + "step": 2446 + }, + { + "epoch": 2.171137220128575, + "grad_norm": 0.00433770939707756, + "learning_rate": 1.8697344637928123e-06, + "loss": 0.0, + "step": 2447 + }, + { + "epoch": 2.172023941476391, + "grad_norm": 0.016662631183862686, + "learning_rate": 1.8660026587733137e-06, + "loss": 0.0, + "step": 2448 + }, + { + "epoch": 2.1729106628242074, + "grad_norm": 0.045981135219335556, + "learning_rate": 1.8622737270862496e-06, + "loss": 0.0002, + "step": 2449 + }, + { + "epoch": 2.173797384172024, + "grad_norm": 0.008949820883572102, + "learning_rate": 1.8585476721504053e-06, + "loss": 0.0, + "step": 2450 + }, + { + "epoch": 2.1746841055198405, + "grad_norm": 0.0555342398583889, + "learning_rate": 1.8548244973819213e-06, + "loss": 0.0001, + "step": 2451 + }, + { + "epoch": 2.175570826867657, + "grad_norm": 0.020420093089342117, + "learning_rate": 1.8511042061943119e-06, + "loss": 0.0, + "step": 2452 + }, + { + "epoch": 2.176457548215473, + "grad_norm": 0.01802026480436325, + "learning_rate": 1.847386801998433e-06, + "loss": 0.0001, + "step": 2453 + }, + { + "epoch": 2.1773442695632896, + "grad_norm": 0.016103439033031464, + "learning_rate": 1.8436722882025043e-06, + "loss": 0.0, + "step": 2454 + }, + { + "epoch": 2.178230990911106, + "grad_norm": 0.46386483311653137, + "learning_rate": 1.8399606682120858e-06, + "loss": 0.0047, + "step": 2455 + }, + { + "epoch": 2.1791177122589227, + "grad_norm": 0.0012606150703504682, + "learning_rate": 1.8362519454300964e-06, + "loss": 0.0, + "step": 2456 + }, + { + "epoch": 2.1800044336067392, + "grad_norm": 0.005236682016402483, + "learning_rate": 1.8325461232567875e-06, + "loss": 0.0, + "step": 2457 + }, + { + "epoch": 2.1808911549545558, + "grad_norm": 0.04357163980603218, + "learning_rate": 1.828843205089757e-06, + "loss": 0.0001, + "step": 2458 + }, + { + "epoch": 2.181777876302372, + "grad_norm": 0.004069244954735041, + "learning_rate": 1.8251431943239396e-06, + "loss": 0.0, + "step": 2459 + }, + { + "epoch": 2.1826645976501884, + "grad_norm": 0.020366471260786057, + "learning_rate": 1.8214460943516055e-06, + "loss": 0.0001, + "step": 2460 + }, + { + "epoch": 2.183551318998005, + "grad_norm": 0.012451727874577045, + "learning_rate": 1.8177519085623513e-06, + "loss": 0.0001, + "step": 2461 + }, + { + "epoch": 2.1844380403458215, + "grad_norm": 0.006223964970558882, + "learning_rate": 1.8140606403431067e-06, + "loss": 0.0, + "step": 2462 + }, + { + "epoch": 2.1853247616936375, + "grad_norm": 0.04250934347510338, + "learning_rate": 1.8103722930781249e-06, + "loss": 0.0002, + "step": 2463 + }, + { + "epoch": 2.186211483041454, + "grad_norm": 0.02039632387459278, + "learning_rate": 1.8066868701489815e-06, + "loss": 0.0001, + "step": 2464 + }, + { + "epoch": 2.1870982043892706, + "grad_norm": 0.021646128967404366, + "learning_rate": 1.80300437493457e-06, + "loss": 0.0001, + "step": 2465 + }, + { + "epoch": 2.187984925737087, + "grad_norm": 0.003513685893267393, + "learning_rate": 1.7993248108111027e-06, + "loss": 0.0, + "step": 2466 + }, + { + "epoch": 2.1888716470849037, + "grad_norm": 0.0011019895318895578, + "learning_rate": 1.7956481811520987e-06, + "loss": 0.0, + "step": 2467 + }, + { + "epoch": 2.18975836843272, + "grad_norm": 0.05865537375211716, + "learning_rate": 1.791974489328392e-06, + "loss": 0.0003, + "step": 2468 + }, + { + "epoch": 2.1906450897805363, + "grad_norm": 0.023978404700756073, + "learning_rate": 1.788303738708121e-06, + "loss": 0.0001, + "step": 2469 + }, + { + "epoch": 2.191531811128353, + "grad_norm": 0.03099576011300087, + "learning_rate": 1.7846359326567274e-06, + "loss": 0.0001, + "step": 2470 + }, + { + "epoch": 2.1924185324761694, + "grad_norm": 0.10269967466592789, + "learning_rate": 1.780971074536957e-06, + "loss": 0.0002, + "step": 2471 + }, + { + "epoch": 2.193305253823986, + "grad_norm": 0.20377202332019806, + "learning_rate": 1.7773091677088427e-06, + "loss": 0.0008, + "step": 2472 + }, + { + "epoch": 2.1941919751718024, + "grad_norm": 0.005082233808934689, + "learning_rate": 1.7736502155297263e-06, + "loss": 0.0, + "step": 2473 + }, + { + "epoch": 2.1950786965196185, + "grad_norm": 0.09815764427185059, + "learning_rate": 1.769994221354227e-06, + "loss": 0.0005, + "step": 2474 + }, + { + "epoch": 2.195965417867435, + "grad_norm": 0.1481843888759613, + "learning_rate": 1.7663411885342625e-06, + "loss": 0.0002, + "step": 2475 + }, + { + "epoch": 2.1968521392152516, + "grad_norm": 0.007009209133684635, + "learning_rate": 1.7626911204190245e-06, + "loss": 0.0, + "step": 2476 + }, + { + "epoch": 2.197738860563068, + "grad_norm": 0.1537850797176361, + "learning_rate": 1.7590440203550009e-06, + "loss": 0.0005, + "step": 2477 + }, + { + "epoch": 2.1986255819108846, + "grad_norm": 0.32946306467056274, + "learning_rate": 1.7553998916859439e-06, + "loss": 0.0017, + "step": 2478 + }, + { + "epoch": 2.199512303258701, + "grad_norm": 0.002217859961092472, + "learning_rate": 1.75175873775289e-06, + "loss": 0.0, + "step": 2479 + }, + { + "epoch": 2.2003990246065173, + "grad_norm": 0.2838335633277893, + "learning_rate": 1.7481205618941472e-06, + "loss": 0.0008, + "step": 2480 + }, + { + "epoch": 2.201285745954334, + "grad_norm": 0.13207651674747467, + "learning_rate": 1.7444853674452932e-06, + "loss": 0.0005, + "step": 2481 + }, + { + "epoch": 2.2021724673021503, + "grad_norm": 0.0635002851486206, + "learning_rate": 1.7408531577391685e-06, + "loss": 0.0002, + "step": 2482 + }, + { + "epoch": 2.203059188649967, + "grad_norm": 0.306825190782547, + "learning_rate": 1.7372239361058813e-06, + "loss": 0.0008, + "step": 2483 + }, + { + "epoch": 2.2039459099977834, + "grad_norm": 0.0018865448655560613, + "learning_rate": 1.7335977058727983e-06, + "loss": 0.0, + "step": 2484 + }, + { + "epoch": 2.2048326313455995, + "grad_norm": 0.00354435364715755, + "learning_rate": 1.7299744703645466e-06, + "loss": 0.0, + "step": 2485 + }, + { + "epoch": 2.205719352693416, + "grad_norm": 0.19610072672367096, + "learning_rate": 1.726354232903002e-06, + "loss": 0.0005, + "step": 2486 + }, + { + "epoch": 2.2066060740412325, + "grad_norm": 0.0014539690455421805, + "learning_rate": 1.722736996807296e-06, + "loss": 0.0, + "step": 2487 + }, + { + "epoch": 2.207492795389049, + "grad_norm": 0.16978108882904053, + "learning_rate": 1.7191227653938081e-06, + "loss": 0.0005, + "step": 2488 + }, + { + "epoch": 2.2083795167368656, + "grad_norm": 0.06128441169857979, + "learning_rate": 1.715511541976161e-06, + "loss": 0.0003, + "step": 2489 + }, + { + "epoch": 2.2092662380846817, + "grad_norm": 0.0005087190656922758, + "learning_rate": 1.711903329865221e-06, + "loss": 0.0, + "step": 2490 + }, + { + "epoch": 2.2101529594324982, + "grad_norm": 0.08252158761024475, + "learning_rate": 1.7082981323690957e-06, + "loss": 0.0002, + "step": 2491 + }, + { + "epoch": 2.2110396807803148, + "grad_norm": 0.0013363012112677097, + "learning_rate": 1.7046959527931224e-06, + "loss": 0.0, + "step": 2492 + }, + { + "epoch": 2.2119264021281313, + "grad_norm": 0.009803982451558113, + "learning_rate": 1.7010967944398777e-06, + "loss": 0.0, + "step": 2493 + }, + { + "epoch": 2.212813123475948, + "grad_norm": 0.00928582064807415, + "learning_rate": 1.6975006606091655e-06, + "loss": 0.0, + "step": 2494 + }, + { + "epoch": 2.213699844823764, + "grad_norm": 0.04216178134083748, + "learning_rate": 1.6939075545980172e-06, + "loss": 0.0001, + "step": 2495 + }, + { + "epoch": 2.2145865661715805, + "grad_norm": 0.012317902408540249, + "learning_rate": 1.690317479700691e-06, + "loss": 0.0, + "step": 2496 + }, + { + "epoch": 2.215473287519397, + "grad_norm": 0.008212944492697716, + "learning_rate": 1.6867304392086575e-06, + "loss": 0.0, + "step": 2497 + }, + { + "epoch": 2.2163600088672135, + "grad_norm": 0.0018377496162429452, + "learning_rate": 1.6831464364106182e-06, + "loss": 0.0, + "step": 2498 + }, + { + "epoch": 2.21724673021503, + "grad_norm": 0.051457684487104416, + "learning_rate": 1.6795654745924772e-06, + "loss": 0.0002, + "step": 2499 + }, + { + "epoch": 2.2181334515628466, + "grad_norm": 0.0026265771593898535, + "learning_rate": 1.6759875570373597e-06, + "loss": 0.0, + "step": 2500 + }, + { + "epoch": 2.2190201729106627, + "grad_norm": 0.8870090842247009, + "learning_rate": 1.6724126870255908e-06, + "loss": 0.0042, + "step": 2501 + }, + { + "epoch": 2.219906894258479, + "grad_norm": 0.002598776947706938, + "learning_rate": 1.6688408678347134e-06, + "loss": 0.0, + "step": 2502 + }, + { + "epoch": 2.2207936156062957, + "grad_norm": 0.004365300294011831, + "learning_rate": 1.6652721027394619e-06, + "loss": 0.0, + "step": 2503 + }, + { + "epoch": 2.2216803369541123, + "grad_norm": 0.0315382219851017, + "learning_rate": 1.6617063950117773e-06, + "loss": 0.0001, + "step": 2504 + }, + { + "epoch": 2.222567058301929, + "grad_norm": 0.5729379653930664, + "learning_rate": 1.6581437479207952e-06, + "loss": 0.0028, + "step": 2505 + }, + { + "epoch": 2.223453779649745, + "grad_norm": 0.02651681751012802, + "learning_rate": 1.654584164732847e-06, + "loss": 0.0001, + "step": 2506 + }, + { + "epoch": 2.2243405009975614, + "grad_norm": 0.00810746569186449, + "learning_rate": 1.6510276487114508e-06, + "loss": 0.0, + "step": 2507 + }, + { + "epoch": 2.225227222345378, + "grad_norm": 0.027231650426983833, + "learning_rate": 1.6474742031173164e-06, + "loss": 0.0001, + "step": 2508 + }, + { + "epoch": 2.2261139436931945, + "grad_norm": 0.4568202495574951, + "learning_rate": 1.643923831208339e-06, + "loss": 0.0009, + "step": 2509 + }, + { + "epoch": 2.227000665041011, + "grad_norm": 0.12720799446105957, + "learning_rate": 1.640376536239594e-06, + "loss": 0.0003, + "step": 2510 + }, + { + "epoch": 2.227887386388827, + "grad_norm": 0.020172515884041786, + "learning_rate": 1.6368323214633331e-06, + "loss": 0.0001, + "step": 2511 + }, + { + "epoch": 2.2287741077366436, + "grad_norm": 0.10302843153476715, + "learning_rate": 1.6332911901289926e-06, + "loss": 0.0003, + "step": 2512 + }, + { + "epoch": 2.22966082908446, + "grad_norm": 0.0018525099148973823, + "learning_rate": 1.6297531454831718e-06, + "loss": 0.0, + "step": 2513 + }, + { + "epoch": 2.2305475504322767, + "grad_norm": 0.009870581328868866, + "learning_rate": 1.6262181907696456e-06, + "loss": 0.0, + "step": 2514 + }, + { + "epoch": 2.2314342717800932, + "grad_norm": 0.0018539935117587447, + "learning_rate": 1.622686329229355e-06, + "loss": 0.0, + "step": 2515 + }, + { + "epoch": 2.2323209931279093, + "grad_norm": 0.1914931684732437, + "learning_rate": 1.6191575641004049e-06, + "loss": 0.0006, + "step": 2516 + }, + { + "epoch": 2.233207714475726, + "grad_norm": 0.10301623493432999, + "learning_rate": 1.6156318986180636e-06, + "loss": 0.0002, + "step": 2517 + }, + { + "epoch": 2.2340944358235424, + "grad_norm": 0.013894562609493732, + "learning_rate": 1.6121093360147517e-06, + "loss": 0.0001, + "step": 2518 + }, + { + "epoch": 2.234981157171359, + "grad_norm": 0.021681584417819977, + "learning_rate": 1.60858987952005e-06, + "loss": 0.0001, + "step": 2519 + }, + { + "epoch": 2.2358678785191755, + "grad_norm": 0.0033847822342067957, + "learning_rate": 1.6050735323606904e-06, + "loss": 0.0, + "step": 2520 + }, + { + "epoch": 2.236754599866992, + "grad_norm": 0.006841412745416164, + "learning_rate": 1.6015602977605555e-06, + "loss": 0.0, + "step": 2521 + }, + { + "epoch": 2.237641321214808, + "grad_norm": 0.031053803861141205, + "learning_rate": 1.5980501789406673e-06, + "loss": 0.0001, + "step": 2522 + }, + { + "epoch": 2.2385280425626246, + "grad_norm": 0.004785686265677214, + "learning_rate": 1.594543179119204e-06, + "loss": 0.0, + "step": 2523 + }, + { + "epoch": 2.239414763910441, + "grad_norm": 0.014937586151063442, + "learning_rate": 1.5910393015114705e-06, + "loss": 0.0, + "step": 2524 + }, + { + "epoch": 2.2403014852582577, + "grad_norm": 0.22507883608341217, + "learning_rate": 1.5875385493299179e-06, + "loss": 0.0023, + "step": 2525 + }, + { + "epoch": 2.241188206606074, + "grad_norm": 0.04809168726205826, + "learning_rate": 1.5840409257841294e-06, + "loss": 0.0002, + "step": 2526 + }, + { + "epoch": 2.2420749279538903, + "grad_norm": 0.018942655995488167, + "learning_rate": 1.5805464340808201e-06, + "loss": 0.0, + "step": 2527 + }, + { + "epoch": 2.242961649301707, + "grad_norm": 0.009502418339252472, + "learning_rate": 1.5770550774238312e-06, + "loss": 0.0, + "step": 2528 + }, + { + "epoch": 2.2438483706495234, + "grad_norm": 0.008832428604364395, + "learning_rate": 1.5735668590141328e-06, + "loss": 0.0, + "step": 2529 + }, + { + "epoch": 2.24473509199734, + "grad_norm": 0.001664695329964161, + "learning_rate": 1.5700817820498165e-06, + "loss": 0.0, + "step": 2530 + }, + { + "epoch": 2.2456218133451564, + "grad_norm": 0.009838556870818138, + "learning_rate": 1.5665998497260959e-06, + "loss": 0.0, + "step": 2531 + }, + { + "epoch": 2.246508534692973, + "grad_norm": 0.04569023847579956, + "learning_rate": 1.5631210652352945e-06, + "loss": 0.0001, + "step": 2532 + }, + { + "epoch": 2.247395256040789, + "grad_norm": 0.005965916905552149, + "learning_rate": 1.5596454317668613e-06, + "loss": 0.0, + "step": 2533 + }, + { + "epoch": 2.2482819773886056, + "grad_norm": 0.010405507870018482, + "learning_rate": 1.5561729525073454e-06, + "loss": 0.0, + "step": 2534 + }, + { + "epoch": 2.249168698736422, + "grad_norm": 0.2523658573627472, + "learning_rate": 1.552703630640411e-06, + "loss": 0.0012, + "step": 2535 + }, + { + "epoch": 2.2500554200842386, + "grad_norm": 0.00390112167224288, + "learning_rate": 1.5492374693468204e-06, + "loss": 0.0, + "step": 2536 + }, + { + "epoch": 2.2509421414320547, + "grad_norm": 0.010760684497654438, + "learning_rate": 1.5457744718044493e-06, + "loss": 0.0, + "step": 2537 + }, + { + "epoch": 2.2518288627798713, + "grad_norm": 0.0015860433923080564, + "learning_rate": 1.5423146411882616e-06, + "loss": 0.0, + "step": 2538 + }, + { + "epoch": 2.252715584127688, + "grad_norm": 0.019439872354269028, + "learning_rate": 1.5388579806703241e-06, + "loss": 0.0001, + "step": 2539 + }, + { + "epoch": 2.2536023054755043, + "grad_norm": 0.017621183767914772, + "learning_rate": 1.5354044934197953e-06, + "loss": 0.0001, + "step": 2540 + }, + { + "epoch": 2.254489026823321, + "grad_norm": 0.03982497379183769, + "learning_rate": 1.5319541826029243e-06, + "loss": 0.0002, + "step": 2541 + }, + { + "epoch": 2.2553757481711374, + "grad_norm": 0.0021173080895096064, + "learning_rate": 1.5285070513830502e-06, + "loss": 0.0, + "step": 2542 + }, + { + "epoch": 2.2562624695189535, + "grad_norm": 0.008100396953523159, + "learning_rate": 1.525063102920591e-06, + "loss": 0.0, + "step": 2543 + }, + { + "epoch": 2.25714919086677, + "grad_norm": 0.014582299627363682, + "learning_rate": 1.521622340373053e-06, + "loss": 0.0001, + "step": 2544 + }, + { + "epoch": 2.2580359122145865, + "grad_norm": 0.06363100558519363, + "learning_rate": 1.518184766895019e-06, + "loss": 0.0001, + "step": 2545 + }, + { + "epoch": 2.258922633562403, + "grad_norm": 0.06486165523529053, + "learning_rate": 1.5147503856381478e-06, + "loss": 0.0002, + "step": 2546 + }, + { + "epoch": 2.2598093549102196, + "grad_norm": 0.008981321938335896, + "learning_rate": 1.5113191997511716e-06, + "loss": 0.0, + "step": 2547 + }, + { + "epoch": 2.2606960762580357, + "grad_norm": 0.17823724448680878, + "learning_rate": 1.507891212379896e-06, + "loss": 0.0006, + "step": 2548 + }, + { + "epoch": 2.2615827976058522, + "grad_norm": 0.02821299619972706, + "learning_rate": 1.504466426667187e-06, + "loss": 0.0001, + "step": 2549 + }, + { + "epoch": 2.2624695189536688, + "grad_norm": 0.0017538291867822409, + "learning_rate": 1.5010448457529814e-06, + "loss": 0.0, + "step": 2550 + }, + { + "epoch": 2.2633562403014853, + "grad_norm": 0.0058082821778953075, + "learning_rate": 1.4976264727742768e-06, + "loss": 0.0, + "step": 2551 + }, + { + "epoch": 2.264242961649302, + "grad_norm": 0.10759159922599792, + "learning_rate": 1.4942113108651291e-06, + "loss": 0.0003, + "step": 2552 + }, + { + "epoch": 2.2651296829971184, + "grad_norm": 0.058488212525844574, + "learning_rate": 1.490799363156646e-06, + "loss": 0.0002, + "step": 2553 + }, + { + "epoch": 2.2660164043449345, + "grad_norm": 0.011463595554232597, + "learning_rate": 1.4873906327769978e-06, + "loss": 0.0, + "step": 2554 + }, + { + "epoch": 2.266903125692751, + "grad_norm": 0.0010025546653196216, + "learning_rate": 1.4839851228513957e-06, + "loss": 0.0, + "step": 2555 + }, + { + "epoch": 2.2677898470405675, + "grad_norm": 0.002267697127535939, + "learning_rate": 1.4805828365021047e-06, + "loss": 0.0, + "step": 2556 + }, + { + "epoch": 2.268676568388384, + "grad_norm": 0.008600475266575813, + "learning_rate": 1.4771837768484276e-06, + "loss": 0.0, + "step": 2557 + }, + { + "epoch": 2.2695632897362006, + "grad_norm": 0.4348275363445282, + "learning_rate": 1.4737879470067184e-06, + "loss": 0.0024, + "step": 2558 + }, + { + "epoch": 2.2704500110840167, + "grad_norm": 0.0017552981153130531, + "learning_rate": 1.4703953500903613e-06, + "loss": 0.0, + "step": 2559 + }, + { + "epoch": 2.271336732431833, + "grad_norm": 0.0007721121073700488, + "learning_rate": 1.4670059892097793e-06, + "loss": 0.0, + "step": 2560 + }, + { + "epoch": 2.2722234537796497, + "grad_norm": 0.037658073008060455, + "learning_rate": 1.4636198674724305e-06, + "loss": 0.0001, + "step": 2561 + }, + { + "epoch": 2.2731101751274663, + "grad_norm": 0.017900565639138222, + "learning_rate": 1.4602369879828033e-06, + "loss": 0.0001, + "step": 2562 + }, + { + "epoch": 2.273996896475283, + "grad_norm": 0.025672055780887604, + "learning_rate": 1.4568573538424075e-06, + "loss": 0.0001, + "step": 2563 + }, + { + "epoch": 2.2748836178230993, + "grad_norm": 0.7328742742538452, + "learning_rate": 1.4534809681497846e-06, + "loss": 0.0042, + "step": 2564 + }, + { + "epoch": 2.2757703391709154, + "grad_norm": 0.02617855742573738, + "learning_rate": 1.4501078340004954e-06, + "loss": 0.0001, + "step": 2565 + }, + { + "epoch": 2.276657060518732, + "grad_norm": 0.014606747776269913, + "learning_rate": 1.446737954487118e-06, + "loss": 0.0001, + "step": 2566 + }, + { + "epoch": 2.2775437818665485, + "grad_norm": 0.0028215658385306597, + "learning_rate": 1.4433713326992488e-06, + "loss": 0.0, + "step": 2567 + }, + { + "epoch": 2.278430503214365, + "grad_norm": 0.06955783069133759, + "learning_rate": 1.4400079717234983e-06, + "loss": 0.0001, + "step": 2568 + }, + { + "epoch": 2.279317224562181, + "grad_norm": 0.0066056400537490845, + "learning_rate": 1.436647874643482e-06, + "loss": 0.0, + "step": 2569 + }, + { + "epoch": 2.2802039459099976, + "grad_norm": 0.0011473114136606455, + "learning_rate": 1.4332910445398285e-06, + "loss": 0.0, + "step": 2570 + }, + { + "epoch": 2.281090667257814, + "grad_norm": 0.07209219038486481, + "learning_rate": 1.4299374844901692e-06, + "loss": 0.0002, + "step": 2571 + }, + { + "epoch": 2.2819773886056307, + "grad_norm": 0.0020093126222491264, + "learning_rate": 1.426587197569137e-06, + "loss": 0.0, + "step": 2572 + }, + { + "epoch": 2.2828641099534472, + "grad_norm": 0.002489017089828849, + "learning_rate": 1.4232401868483669e-06, + "loss": 0.0, + "step": 2573 + }, + { + "epoch": 2.2837508313012638, + "grad_norm": 0.006025847978889942, + "learning_rate": 1.4198964553964823e-06, + "loss": 0.0, + "step": 2574 + }, + { + "epoch": 2.28463755264908, + "grad_norm": 0.00035993586061522365, + "learning_rate": 1.4165560062791117e-06, + "loss": 0.0, + "step": 2575 + }, + { + "epoch": 2.2855242739968964, + "grad_norm": 0.005870227701961994, + "learning_rate": 1.4132188425588633e-06, + "loss": 0.0, + "step": 2576 + }, + { + "epoch": 2.286410995344713, + "grad_norm": 0.009169611148536205, + "learning_rate": 1.4098849672953418e-06, + "loss": 0.0, + "step": 2577 + }, + { + "epoch": 2.2872977166925295, + "grad_norm": 0.008256441913545132, + "learning_rate": 1.4065543835451278e-06, + "loss": 0.0, + "step": 2578 + }, + { + "epoch": 2.288184438040346, + "grad_norm": 0.03790901228785515, + "learning_rate": 1.4032270943617953e-06, + "loss": 0.0001, + "step": 2579 + }, + { + "epoch": 2.289071159388162, + "grad_norm": 0.07731454819440842, + "learning_rate": 1.3999031027958876e-06, + "loss": 0.0003, + "step": 2580 + }, + { + "epoch": 2.2899578807359786, + "grad_norm": 0.018880745396018028, + "learning_rate": 1.3965824118949307e-06, + "loss": 0.0001, + "step": 2581 + }, + { + "epoch": 2.290844602083795, + "grad_norm": 0.40576595067977905, + "learning_rate": 1.393265024703422e-06, + "loss": 0.0011, + "step": 2582 + }, + { + "epoch": 2.2917313234316117, + "grad_norm": 0.002390408655628562, + "learning_rate": 1.3899509442628328e-06, + "loss": 0.0, + "step": 2583 + }, + { + "epoch": 2.292618044779428, + "grad_norm": 0.11626479774713516, + "learning_rate": 1.3866401736115975e-06, + "loss": 0.0004, + "step": 2584 + }, + { + "epoch": 2.2935047661272447, + "grad_norm": 0.0015793382190167904, + "learning_rate": 1.3833327157851194e-06, + "loss": 0.0, + "step": 2585 + }, + { + "epoch": 2.294391487475061, + "grad_norm": 0.012690646573901176, + "learning_rate": 1.380028573815766e-06, + "loss": 0.0, + "step": 2586 + }, + { + "epoch": 2.2952782088228774, + "grad_norm": 0.029712093994021416, + "learning_rate": 1.376727750732863e-06, + "loss": 0.0001, + "step": 2587 + }, + { + "epoch": 2.296164930170694, + "grad_norm": 0.002025979571044445, + "learning_rate": 1.3734302495626905e-06, + "loss": 0.0, + "step": 2588 + }, + { + "epoch": 2.2970516515185104, + "grad_norm": 0.012119759805500507, + "learning_rate": 1.370136073328487e-06, + "loss": 0.0, + "step": 2589 + }, + { + "epoch": 2.2979383728663265, + "grad_norm": 0.10140902549028397, + "learning_rate": 1.3668452250504417e-06, + "loss": 0.0002, + "step": 2590 + }, + { + "epoch": 2.298825094214143, + "grad_norm": 0.007041087839752436, + "learning_rate": 1.3635577077456913e-06, + "loss": 0.0, + "step": 2591 + }, + { + "epoch": 2.2997118155619596, + "grad_norm": 0.004575217142701149, + "learning_rate": 1.3602735244283204e-06, + "loss": 0.0, + "step": 2592 + }, + { + "epoch": 2.300598536909776, + "grad_norm": 0.01836671680212021, + "learning_rate": 1.3569926781093568e-06, + "loss": 0.0001, + "step": 2593 + }, + { + "epoch": 2.3014852582575926, + "grad_norm": 0.010562287643551826, + "learning_rate": 1.353715171796765e-06, + "loss": 0.0, + "step": 2594 + }, + { + "epoch": 2.302371979605409, + "grad_norm": 0.07208259403705597, + "learning_rate": 1.3504410084954521e-06, + "loss": 0.0004, + "step": 2595 + }, + { + "epoch": 2.3032587009532253, + "grad_norm": 0.0030105796176940203, + "learning_rate": 1.3471701912072581e-06, + "loss": 0.0, + "step": 2596 + }, + { + "epoch": 2.304145422301042, + "grad_norm": 0.001862941775470972, + "learning_rate": 1.3439027229309564e-06, + "loss": 0.0, + "step": 2597 + }, + { + "epoch": 2.3050321436488583, + "grad_norm": 0.02082369290292263, + "learning_rate": 1.3406386066622495e-06, + "loss": 0.0, + "step": 2598 + }, + { + "epoch": 2.305918864996675, + "grad_norm": 0.13240396976470947, + "learning_rate": 1.337377845393763e-06, + "loss": 0.0004, + "step": 2599 + }, + { + "epoch": 2.3068055863444914, + "grad_norm": 0.0022385409101843834, + "learning_rate": 1.334120442115055e-06, + "loss": 0.0, + "step": 2600 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.5025140643119812, + "learning_rate": 1.3308663998125954e-06, + "loss": 0.0013, + "step": 2601 + }, + { + "epoch": 2.308579029040124, + "grad_norm": 0.06153545901179314, + "learning_rate": 1.3276157214697805e-06, + "loss": 0.0002, + "step": 2602 + }, + { + "epoch": 2.3094657503879406, + "grad_norm": 0.007376537658274174, + "learning_rate": 1.324368410066914e-06, + "loss": 0.0, + "step": 2603 + }, + { + "epoch": 2.310352471735757, + "grad_norm": 0.04724806919693947, + "learning_rate": 1.321124468581224e-06, + "loss": 0.0002, + "step": 2604 + }, + { + "epoch": 2.3112391930835736, + "grad_norm": 0.10160135477781296, + "learning_rate": 1.3178838999868381e-06, + "loss": 0.0004, + "step": 2605 + }, + { + "epoch": 2.31212591443139, + "grad_norm": 0.0051342276856303215, + "learning_rate": 1.3146467072547975e-06, + "loss": 0.0, + "step": 2606 + }, + { + "epoch": 2.3130126357792062, + "grad_norm": 0.009772558696568012, + "learning_rate": 1.3114128933530473e-06, + "loss": 0.0001, + "step": 2607 + }, + { + "epoch": 2.3138993571270228, + "grad_norm": 0.04966363310813904, + "learning_rate": 1.308182461246435e-06, + "loss": 0.0001, + "step": 2608 + }, + { + "epoch": 2.3147860784748393, + "grad_norm": 0.0012651075376197696, + "learning_rate": 1.3049554138967052e-06, + "loss": 0.0, + "step": 2609 + }, + { + "epoch": 2.315672799822656, + "grad_norm": 0.48004817962646484, + "learning_rate": 1.3017317542625018e-06, + "loss": 0.0029, + "step": 2610 + }, + { + "epoch": 2.3165595211704724, + "grad_norm": 0.27828601002693176, + "learning_rate": 1.2985114852993624e-06, + "loss": 0.0004, + "step": 2611 + }, + { + "epoch": 2.3174462425182885, + "grad_norm": 0.3281623423099518, + "learning_rate": 1.2952946099597169e-06, + "loss": 0.0009, + "step": 2612 + }, + { + "epoch": 2.318332963866105, + "grad_norm": 0.01670690067112446, + "learning_rate": 1.2920811311928778e-06, + "loss": 0.0001, + "step": 2613 + }, + { + "epoch": 2.3192196852139215, + "grad_norm": 0.002526923082768917, + "learning_rate": 1.2888710519450543e-06, + "loss": 0.0, + "step": 2614 + }, + { + "epoch": 2.320106406561738, + "grad_norm": 0.027562735602259636, + "learning_rate": 1.2856643751593285e-06, + "loss": 0.0001, + "step": 2615 + }, + { + "epoch": 2.3209931279095546, + "grad_norm": 0.3966238498687744, + "learning_rate": 1.2824611037756686e-06, + "loss": 0.0032, + "step": 2616 + }, + { + "epoch": 2.321879849257371, + "grad_norm": 0.23547692596912384, + "learning_rate": 1.2792612407309195e-06, + "loss": 0.0006, + "step": 2617 + }, + { + "epoch": 2.322766570605187, + "grad_norm": 0.07173073291778564, + "learning_rate": 1.2760647889588008e-06, + "loss": 0.0002, + "step": 2618 + }, + { + "epoch": 2.3236532919530037, + "grad_norm": 0.000607346068136394, + "learning_rate": 1.2728717513899065e-06, + "loss": 0.0, + "step": 2619 + }, + { + "epoch": 2.3245400133008203, + "grad_norm": 0.03587135300040245, + "learning_rate": 1.2696821309516954e-06, + "loss": 0.0001, + "step": 2620 + }, + { + "epoch": 2.325426734648637, + "grad_norm": 0.01693054474890232, + "learning_rate": 1.2664959305684988e-06, + "loss": 0.0001, + "step": 2621 + }, + { + "epoch": 2.326313455996453, + "grad_norm": 0.16542497277259827, + "learning_rate": 1.2633131531615101e-06, + "loss": 0.0002, + "step": 2622 + }, + { + "epoch": 2.3272001773442694, + "grad_norm": 0.03863265737891197, + "learning_rate": 1.2601338016487852e-06, + "loss": 0.0001, + "step": 2623 + }, + { + "epoch": 2.328086898692086, + "grad_norm": 0.16595076024532318, + "learning_rate": 1.2569578789452353e-06, + "loss": 0.0002, + "step": 2624 + }, + { + "epoch": 2.3289736200399025, + "grad_norm": 0.004780843388289213, + "learning_rate": 1.253785387962636e-06, + "loss": 0.0, + "step": 2625 + }, + { + "epoch": 2.329860341387719, + "grad_norm": 0.5518388152122498, + "learning_rate": 1.2506163316096081e-06, + "loss": 0.0053, + "step": 2626 + }, + { + "epoch": 2.3307470627355356, + "grad_norm": 0.09815961122512817, + "learning_rate": 1.247450712791628e-06, + "loss": 0.0004, + "step": 2627 + }, + { + "epoch": 2.3316337840833516, + "grad_norm": 0.016248105093836784, + "learning_rate": 1.24428853441102e-06, + "loss": 0.0, + "step": 2628 + }, + { + "epoch": 2.332520505431168, + "grad_norm": 0.13292931020259857, + "learning_rate": 1.2411297993669546e-06, + "loss": 0.0004, + "step": 2629 + }, + { + "epoch": 2.3334072267789847, + "grad_norm": 0.45038118958473206, + "learning_rate": 1.2379745105554413e-06, + "loss": 0.0055, + "step": 2630 + }, + { + "epoch": 2.3342939481268012, + "grad_norm": 0.009716760367155075, + "learning_rate": 1.2348226708693356e-06, + "loss": 0.0, + "step": 2631 + }, + { + "epoch": 2.3351806694746178, + "grad_norm": 0.29626765847206116, + "learning_rate": 1.231674283198328e-06, + "loss": 0.001, + "step": 2632 + }, + { + "epoch": 2.336067390822434, + "grad_norm": 0.0017720351461321115, + "learning_rate": 1.2285293504289448e-06, + "loss": 0.0, + "step": 2633 + }, + { + "epoch": 2.3369541121702504, + "grad_norm": 0.0024435853119939566, + "learning_rate": 1.2253878754445414e-06, + "loss": 0.0, + "step": 2634 + }, + { + "epoch": 2.337840833518067, + "grad_norm": 0.04254931956529617, + "learning_rate": 1.2222498611253103e-06, + "loss": 0.0001, + "step": 2635 + }, + { + "epoch": 2.3387275548658835, + "grad_norm": 0.28088611364364624, + "learning_rate": 1.219115310348264e-06, + "loss": 0.0023, + "step": 2636 + }, + { + "epoch": 2.3396142762137, + "grad_norm": 0.000748008256778121, + "learning_rate": 1.215984225987244e-06, + "loss": 0.0, + "step": 2637 + }, + { + "epoch": 2.3405009975615165, + "grad_norm": 0.0008762306533753872, + "learning_rate": 1.212856610912908e-06, + "loss": 0.0, + "step": 2638 + }, + { + "epoch": 2.3413877189093326, + "grad_norm": 0.03883393481373787, + "learning_rate": 1.2097324679927431e-06, + "loss": 0.0001, + "step": 2639 + }, + { + "epoch": 2.342274440257149, + "grad_norm": 0.0028116086032241583, + "learning_rate": 1.2066118000910421e-06, + "loss": 0.0, + "step": 2640 + }, + { + "epoch": 2.3431611616049657, + "grad_norm": 0.02033114992082119, + "learning_rate": 1.2034946100689188e-06, + "loss": 0.0001, + "step": 2641 + }, + { + "epoch": 2.344047882952782, + "grad_norm": 0.036860495805740356, + "learning_rate": 1.200380900784296e-06, + "loss": 0.0002, + "step": 2642 + }, + { + "epoch": 2.3449346043005983, + "grad_norm": 0.00874006561934948, + "learning_rate": 1.197270675091905e-06, + "loss": 0.0, + "step": 2643 + }, + { + "epoch": 2.345821325648415, + "grad_norm": 0.005195686127990484, + "learning_rate": 1.1941639358432855e-06, + "loss": 0.0, + "step": 2644 + }, + { + "epoch": 2.3467080469962314, + "grad_norm": 0.005843705963343382, + "learning_rate": 1.1910606858867763e-06, + "loss": 0.0, + "step": 2645 + }, + { + "epoch": 2.347594768344048, + "grad_norm": 0.0016122246161103249, + "learning_rate": 1.1879609280675208e-06, + "loss": 0.0, + "step": 2646 + }, + { + "epoch": 2.3484814896918644, + "grad_norm": 0.01298630889505148, + "learning_rate": 1.1848646652274604e-06, + "loss": 0.0001, + "step": 2647 + }, + { + "epoch": 2.349368211039681, + "grad_norm": 0.004974827636033297, + "learning_rate": 1.1817719002053307e-06, + "loss": 0.0, + "step": 2648 + }, + { + "epoch": 2.350254932387497, + "grad_norm": 0.009422971867024899, + "learning_rate": 1.1786826358366621e-06, + "loss": 0.0, + "step": 2649 + }, + { + "epoch": 2.3511416537353136, + "grad_norm": 0.0029839402996003628, + "learning_rate": 1.1755968749537755e-06, + "loss": 0.0, + "step": 2650 + }, + { + "epoch": 2.35202837508313, + "grad_norm": 0.043002091348171234, + "learning_rate": 1.1725146203857768e-06, + "loss": 0.0001, + "step": 2651 + }, + { + "epoch": 2.3529150964309467, + "grad_norm": 0.010507393628358841, + "learning_rate": 1.1694358749585593e-06, + "loss": 0.0, + "step": 2652 + }, + { + "epoch": 2.353801817778763, + "grad_norm": 0.21527473628520966, + "learning_rate": 1.1663606414947998e-06, + "loss": 0.0005, + "step": 2653 + }, + { + "epoch": 2.3546885391265793, + "grad_norm": 0.021156540140509605, + "learning_rate": 1.1632889228139565e-06, + "loss": 0.0001, + "step": 2654 + }, + { + "epoch": 2.355575260474396, + "grad_norm": 0.010046728886663914, + "learning_rate": 1.1602207217322575e-06, + "loss": 0.0, + "step": 2655 + }, + { + "epoch": 2.3564619818222123, + "grad_norm": 0.0030884298030287027, + "learning_rate": 1.1571560410627181e-06, + "loss": 0.0, + "step": 2656 + }, + { + "epoch": 2.357348703170029, + "grad_norm": 0.02951064519584179, + "learning_rate": 1.154094883615115e-06, + "loss": 0.0001, + "step": 2657 + }, + { + "epoch": 2.3582354245178454, + "grad_norm": 0.028621038421988487, + "learning_rate": 1.1510372521960016e-06, + "loss": 0.0001, + "step": 2658 + }, + { + "epoch": 2.359122145865662, + "grad_norm": 0.09628434479236603, + "learning_rate": 1.1479831496086919e-06, + "loss": 0.0004, + "step": 2659 + }, + { + "epoch": 2.360008867213478, + "grad_norm": 0.007439910434186459, + "learning_rate": 1.1449325786532744e-06, + "loss": 0.0, + "step": 2660 + }, + { + "epoch": 2.3608955885612946, + "grad_norm": 0.12967965006828308, + "learning_rate": 1.1418855421265896e-06, + "loss": 0.0005, + "step": 2661 + }, + { + "epoch": 2.361782309909111, + "grad_norm": 0.2929927110671997, + "learning_rate": 1.1388420428222435e-06, + "loss": 0.0007, + "step": 2662 + }, + { + "epoch": 2.3626690312569276, + "grad_norm": 0.0033991162199527025, + "learning_rate": 1.1358020835305978e-06, + "loss": 0.0, + "step": 2663 + }, + { + "epoch": 2.363555752604744, + "grad_norm": 0.0073296381160616875, + "learning_rate": 1.1327656670387694e-06, + "loss": 0.0, + "step": 2664 + }, + { + "epoch": 2.3644424739525602, + "grad_norm": 0.017435623332858086, + "learning_rate": 1.1297327961306226e-06, + "loss": 0.0, + "step": 2665 + }, + { + "epoch": 2.3653291953003768, + "grad_norm": 0.002517417771741748, + "learning_rate": 1.1267034735867771e-06, + "loss": 0.0, + "step": 2666 + }, + { + "epoch": 2.3662159166481933, + "grad_norm": 0.0027054729871451855, + "learning_rate": 1.1236777021845957e-06, + "loss": 0.0, + "step": 2667 + }, + { + "epoch": 2.36710263799601, + "grad_norm": 0.10869599133729935, + "learning_rate": 1.1206554846981866e-06, + "loss": 0.0005, + "step": 2668 + }, + { + "epoch": 2.3679893593438264, + "grad_norm": 0.02760649099946022, + "learning_rate": 1.1176368238983997e-06, + "loss": 0.0001, + "step": 2669 + }, + { + "epoch": 2.3688760806916425, + "grad_norm": 0.008239630609750748, + "learning_rate": 1.1146217225528255e-06, + "loss": 0.0, + "step": 2670 + }, + { + "epoch": 2.369762802039459, + "grad_norm": 0.0009814087534323335, + "learning_rate": 1.111610183425786e-06, + "loss": 0.0, + "step": 2671 + }, + { + "epoch": 2.3706495233872755, + "grad_norm": 0.010030284523963928, + "learning_rate": 1.1086022092783422e-06, + "loss": 0.0, + "step": 2672 + }, + { + "epoch": 2.371536244735092, + "grad_norm": 0.0005799742066301405, + "learning_rate": 1.1055978028682858e-06, + "loss": 0.0, + "step": 2673 + }, + { + "epoch": 2.3724229660829086, + "grad_norm": 0.01730470359325409, + "learning_rate": 1.1025969669501357e-06, + "loss": 0.0001, + "step": 2674 + }, + { + "epoch": 2.3733096874307247, + "grad_norm": 0.14865989983081818, + "learning_rate": 1.09959970427514e-06, + "loss": 0.0004, + "step": 2675 + }, + { + "epoch": 2.374196408778541, + "grad_norm": 0.31325671076774597, + "learning_rate": 1.0966060175912658e-06, + "loss": 0.0007, + "step": 2676 + }, + { + "epoch": 2.3750831301263577, + "grad_norm": 0.01875937730073929, + "learning_rate": 1.0936159096432097e-06, + "loss": 0.0001, + "step": 2677 + }, + { + "epoch": 2.3759698514741743, + "grad_norm": 0.05218518152832985, + "learning_rate": 1.090629383172379e-06, + "loss": 0.0001, + "step": 2678 + }, + { + "epoch": 2.376856572821991, + "grad_norm": 0.03152287006378174, + "learning_rate": 1.0876464409169029e-06, + "loss": 0.0001, + "step": 2679 + }, + { + "epoch": 2.3777432941698073, + "grad_norm": 0.001231821603141725, + "learning_rate": 1.0846670856116193e-06, + "loss": 0.0, + "step": 2680 + }, + { + "epoch": 2.3786300155176234, + "grad_norm": 0.0006567314267158508, + "learning_rate": 1.0816913199880852e-06, + "loss": 0.0, + "step": 2681 + }, + { + "epoch": 2.37951673686544, + "grad_norm": 0.0030319185461848974, + "learning_rate": 1.0787191467745584e-06, + "loss": 0.0, + "step": 2682 + }, + { + "epoch": 2.3804034582132565, + "grad_norm": 0.010536277666687965, + "learning_rate": 1.075750568696008e-06, + "loss": 0.0, + "step": 2683 + }, + { + "epoch": 2.381290179561073, + "grad_norm": 0.07841084152460098, + "learning_rate": 1.0727855884741057e-06, + "loss": 0.0003, + "step": 2684 + }, + { + "epoch": 2.3821769009088896, + "grad_norm": 0.005802239757031202, + "learning_rate": 1.0698242088272253e-06, + "loss": 0.0, + "step": 2685 + }, + { + "epoch": 2.3830636222567056, + "grad_norm": 0.019029144197702408, + "learning_rate": 1.066866432470437e-06, + "loss": 0.0001, + "step": 2686 + }, + { + "epoch": 2.383950343604522, + "grad_norm": 0.0072951787151396275, + "learning_rate": 1.0639122621155102e-06, + "loss": 0.0, + "step": 2687 + }, + { + "epoch": 2.3848370649523387, + "grad_norm": 0.0025982209481298923, + "learning_rate": 1.0609617004709061e-06, + "loss": 0.0, + "step": 2688 + }, + { + "epoch": 2.3857237863001552, + "grad_norm": 0.001721179811283946, + "learning_rate": 1.058014750241781e-06, + "loss": 0.0, + "step": 2689 + }, + { + "epoch": 2.386610507647972, + "grad_norm": 0.006247865501791239, + "learning_rate": 1.055071414129973e-06, + "loss": 0.0, + "step": 2690 + }, + { + "epoch": 2.3874972289957883, + "grad_norm": 0.0005024709389545023, + "learning_rate": 1.0521316948340166e-06, + "loss": 0.0, + "step": 2691 + }, + { + "epoch": 2.3883839503436044, + "grad_norm": 0.011277176439762115, + "learning_rate": 1.0491955950491212e-06, + "loss": 0.0, + "step": 2692 + }, + { + "epoch": 2.389270671691421, + "grad_norm": 0.0024250997230410576, + "learning_rate": 1.0462631174671823e-06, + "loss": 0.0, + "step": 2693 + }, + { + "epoch": 2.3901573930392375, + "grad_norm": 0.10233346372842789, + "learning_rate": 1.0433342647767735e-06, + "loss": 0.0004, + "step": 2694 + }, + { + "epoch": 2.391044114387054, + "grad_norm": 0.0037988275289535522, + "learning_rate": 1.0404090396631477e-06, + "loss": 0.0, + "step": 2695 + }, + { + "epoch": 2.39193083573487, + "grad_norm": 0.003683975664898753, + "learning_rate": 1.0374874448082257e-06, + "loss": 0.0, + "step": 2696 + }, + { + "epoch": 2.3928175570826866, + "grad_norm": 0.371255487203598, + "learning_rate": 1.034569482890605e-06, + "loss": 0.0012, + "step": 2697 + }, + { + "epoch": 2.393704278430503, + "grad_norm": 0.2531975209712982, + "learning_rate": 1.0316551565855515e-06, + "loss": 0.0008, + "step": 2698 + }, + { + "epoch": 2.3945909997783197, + "grad_norm": 0.008249549195170403, + "learning_rate": 1.0287444685649968e-06, + "loss": 0.0, + "step": 2699 + }, + { + "epoch": 2.395477721126136, + "grad_norm": 0.004005208145827055, + "learning_rate": 1.0258374214975386e-06, + "loss": 0.0, + "step": 2700 + }, + { + "epoch": 2.3963644424739527, + "grad_norm": 0.010472718626260757, + "learning_rate": 1.022934018048432e-06, + "loss": 0.0, + "step": 2701 + }, + { + "epoch": 2.397251163821769, + "grad_norm": 0.027334703132510185, + "learning_rate": 1.0200342608795998e-06, + "loss": 0.0001, + "step": 2702 + }, + { + "epoch": 2.3981378851695854, + "grad_norm": 0.0051247309893369675, + "learning_rate": 1.0171381526496132e-06, + "loss": 0.0, + "step": 2703 + }, + { + "epoch": 2.399024606517402, + "grad_norm": 0.6095221638679504, + "learning_rate": 1.0142456960137032e-06, + "loss": 0.0016, + "step": 2704 + }, + { + "epoch": 2.3999113278652184, + "grad_norm": 0.1208229809999466, + "learning_rate": 1.0113568936237478e-06, + "loss": 0.0001, + "step": 2705 + }, + { + "epoch": 2.400798049213035, + "grad_norm": 0.0789174810051918, + "learning_rate": 1.0084717481282842e-06, + "loss": 0.0002, + "step": 2706 + }, + { + "epoch": 2.401684770560851, + "grad_norm": 0.02274036966264248, + "learning_rate": 1.0055902621724855e-06, + "loss": 0.0001, + "step": 2707 + }, + { + "epoch": 2.4025714919086676, + "grad_norm": 0.011838889680802822, + "learning_rate": 1.0027124383981768e-06, + "loss": 0.0, + "step": 2708 + }, + { + "epoch": 2.403458213256484, + "grad_norm": 0.11658446490764618, + "learning_rate": 9.998382794438243e-07, + "loss": 0.0004, + "step": 2709 + }, + { + "epoch": 2.4043449346043007, + "grad_norm": 0.002783681731671095, + "learning_rate": 9.969677879445344e-07, + "loss": 0.0, + "step": 2710 + }, + { + "epoch": 2.405231655952117, + "grad_norm": 0.0004409880784805864, + "learning_rate": 9.941009665320483e-07, + "loss": 0.0, + "step": 2711 + }, + { + "epoch": 2.4061183772999337, + "grad_norm": 0.23596900701522827, + "learning_rate": 9.912378178347448e-07, + "loss": 0.0012, + "step": 2712 + }, + { + "epoch": 2.40700509864775, + "grad_norm": 0.1281604915857315, + "learning_rate": 9.883783444776357e-07, + "loss": 0.0008, + "step": 2713 + }, + { + "epoch": 2.4078918199955663, + "grad_norm": 0.0167441014200449, + "learning_rate": 9.855225490823639e-07, + "loss": 0.0001, + "step": 2714 + }, + { + "epoch": 2.408778541343383, + "grad_norm": 0.009180407971143723, + "learning_rate": 9.826704342671945e-07, + "loss": 0.0, + "step": 2715 + }, + { + "epoch": 2.4096652626911994, + "grad_norm": 0.020933065563440323, + "learning_rate": 9.798220026470285e-07, + "loss": 0.0001, + "step": 2716 + }, + { + "epoch": 2.4105519840390155, + "grad_norm": 0.012441801838576794, + "learning_rate": 9.76977256833379e-07, + "loss": 0.0, + "step": 2717 + }, + { + "epoch": 2.411438705386832, + "grad_norm": 0.13534188270568848, + "learning_rate": 9.741361994343867e-07, + "loss": 0.0007, + "step": 2718 + }, + { + "epoch": 2.4123254267346486, + "grad_norm": 0.004485164303332567, + "learning_rate": 9.712988330548096e-07, + "loss": 0.0, + "step": 2719 + }, + { + "epoch": 2.413212148082465, + "grad_norm": 0.005443104077130556, + "learning_rate": 9.6846516029602e-07, + "loss": 0.0, + "step": 2720 + }, + { + "epoch": 2.4140988694302816, + "grad_norm": 0.0010421723127365112, + "learning_rate": 9.65635183756007e-07, + "loss": 0.0, + "step": 2721 + }, + { + "epoch": 2.414985590778098, + "grad_norm": 0.1741264909505844, + "learning_rate": 9.628089060293643e-07, + "loss": 0.0009, + "step": 2722 + }, + { + "epoch": 2.4158723121259142, + "grad_norm": 0.05585017427802086, + "learning_rate": 9.59986329707302e-07, + "loss": 0.0002, + "step": 2723 + }, + { + "epoch": 2.4167590334737308, + "grad_norm": 0.0028329435735940933, + "learning_rate": 9.57167457377633e-07, + "loss": 0.0, + "step": 2724 + }, + { + "epoch": 2.4176457548215473, + "grad_norm": 0.002247181022539735, + "learning_rate": 9.543522916247749e-07, + "loss": 0.0, + "step": 2725 + }, + { + "epoch": 2.418532476169364, + "grad_norm": 0.020004788413643837, + "learning_rate": 9.515408350297439e-07, + "loss": 0.0001, + "step": 2726 + }, + { + "epoch": 2.4194191975171804, + "grad_norm": 0.012717186473309994, + "learning_rate": 9.487330901701636e-07, + "loss": 0.0, + "step": 2727 + }, + { + "epoch": 2.4203059188649965, + "grad_norm": 0.012186716310679913, + "learning_rate": 9.459290596202453e-07, + "loss": 0.0, + "step": 2728 + }, + { + "epoch": 2.421192640212813, + "grad_norm": 0.003059729468077421, + "learning_rate": 9.431287459508004e-07, + "loss": 0.0, + "step": 2729 + }, + { + "epoch": 2.4220793615606295, + "grad_norm": 0.005862931255251169, + "learning_rate": 9.403321517292313e-07, + "loss": 0.0, + "step": 2730 + }, + { + "epoch": 2.422966082908446, + "grad_norm": 0.012926910072565079, + "learning_rate": 9.375392795195315e-07, + "loss": 0.0, + "step": 2731 + }, + { + "epoch": 2.4238528042562626, + "grad_norm": 0.009246543981134892, + "learning_rate": 9.347501318822783e-07, + "loss": 0.0, + "step": 2732 + }, + { + "epoch": 2.424739525604079, + "grad_norm": 0.033203281462192535, + "learning_rate": 9.319647113746383e-07, + "loss": 0.0001, + "step": 2733 + }, + { + "epoch": 2.425626246951895, + "grad_norm": 0.0029225186444818974, + "learning_rate": 9.291830205503588e-07, + "loss": 0.0, + "step": 2734 + }, + { + "epoch": 2.4265129682997117, + "grad_norm": 0.00041219202103093266, + "learning_rate": 9.264050619597697e-07, + "loss": 0.0, + "step": 2735 + }, + { + "epoch": 2.4273996896475283, + "grad_norm": 0.012741847895085812, + "learning_rate": 9.236308381497738e-07, + "loss": 0.0001, + "step": 2736 + }, + { + "epoch": 2.428286410995345, + "grad_norm": 0.04531732574105263, + "learning_rate": 9.208603516638587e-07, + "loss": 0.0002, + "step": 2737 + }, + { + "epoch": 2.4291731323431613, + "grad_norm": 0.006424774881452322, + "learning_rate": 9.180936050420763e-07, + "loss": 0.0, + "step": 2738 + }, + { + "epoch": 2.4300598536909774, + "grad_norm": 0.000581156462430954, + "learning_rate": 9.153306008210572e-07, + "loss": 0.0, + "step": 2739 + }, + { + "epoch": 2.430946575038794, + "grad_norm": 0.016396358609199524, + "learning_rate": 9.125713415339921e-07, + "loss": 0.0, + "step": 2740 + }, + { + "epoch": 2.4318332963866105, + "grad_norm": 0.0013209610478952527, + "learning_rate": 9.098158297106501e-07, + "loss": 0.0, + "step": 2741 + }, + { + "epoch": 2.432720017734427, + "grad_norm": 0.013338295742869377, + "learning_rate": 9.070640678773535e-07, + "loss": 0.0, + "step": 2742 + }, + { + "epoch": 2.4336067390822436, + "grad_norm": 0.002022724598646164, + "learning_rate": 9.043160585569926e-07, + "loss": 0.0, + "step": 2743 + }, + { + "epoch": 2.43449346043006, + "grad_norm": 0.3681130111217499, + "learning_rate": 9.015718042690152e-07, + "loss": 0.0006, + "step": 2744 + }, + { + "epoch": 2.435380181777876, + "grad_norm": 0.24407528340816498, + "learning_rate": 8.988313075294275e-07, + "loss": 0.0019, + "step": 2745 + }, + { + "epoch": 2.4362669031256927, + "grad_norm": 0.0003583901561796665, + "learning_rate": 8.960945708507907e-07, + "loss": 0.0, + "step": 2746 + }, + { + "epoch": 2.4371536244735093, + "grad_norm": 0.0018106335774064064, + "learning_rate": 8.93361596742216e-07, + "loss": 0.0, + "step": 2747 + }, + { + "epoch": 2.438040345821326, + "grad_norm": 1.3477489948272705, + "learning_rate": 8.906323877093681e-07, + "loss": 0.0048, + "step": 2748 + }, + { + "epoch": 2.438927067169142, + "grad_norm": 0.0024084593169391155, + "learning_rate": 8.879069462544593e-07, + "loss": 0.0, + "step": 2749 + }, + { + "epoch": 2.4398137885169584, + "grad_norm": 0.0646350234746933, + "learning_rate": 8.851852748762457e-07, + "loss": 0.0001, + "step": 2750 + }, + { + "epoch": 2.440700509864775, + "grad_norm": 0.13703598082065582, + "learning_rate": 8.824673760700298e-07, + "loss": 0.0002, + "step": 2751 + }, + { + "epoch": 2.4415872312125915, + "grad_norm": 0.0044349804520606995, + "learning_rate": 8.797532523276542e-07, + "loss": 0.0, + "step": 2752 + }, + { + "epoch": 2.442473952560408, + "grad_norm": 0.005696154665201902, + "learning_rate": 8.770429061374979e-07, + "loss": 0.0, + "step": 2753 + }, + { + "epoch": 2.4433606739082245, + "grad_norm": 0.0011021459940820932, + "learning_rate": 8.743363399844795e-07, + "loss": 0.0, + "step": 2754 + }, + { + "epoch": 2.4442473952560406, + "grad_norm": 0.051142916083335876, + "learning_rate": 8.716335563500511e-07, + "loss": 0.0001, + "step": 2755 + }, + { + "epoch": 2.445134116603857, + "grad_norm": 0.5838609933853149, + "learning_rate": 8.689345577121988e-07, + "loss": 0.0019, + "step": 2756 + }, + { + "epoch": 2.4460208379516737, + "grad_norm": 0.015151509083807468, + "learning_rate": 8.662393465454322e-07, + "loss": 0.0, + "step": 2757 + }, + { + "epoch": 2.44690755929949, + "grad_norm": 0.000398889125790447, + "learning_rate": 8.635479253207979e-07, + "loss": 0.0, + "step": 2758 + }, + { + "epoch": 2.4477942806473068, + "grad_norm": 0.08696360141038895, + "learning_rate": 8.608602965058594e-07, + "loss": 0.0003, + "step": 2759 + }, + { + "epoch": 2.448681001995123, + "grad_norm": 0.001075335661880672, + "learning_rate": 8.581764625647088e-07, + "loss": 0.0, + "step": 2760 + }, + { + "epoch": 2.4495677233429394, + "grad_norm": 0.025259824469685555, + "learning_rate": 8.554964259579524e-07, + "loss": 0.0001, + "step": 2761 + }, + { + "epoch": 2.450454444690756, + "grad_norm": 0.001714448444545269, + "learning_rate": 8.528201891427257e-07, + "loss": 0.0, + "step": 2762 + }, + { + "epoch": 2.4513411660385724, + "grad_norm": 0.005722302943468094, + "learning_rate": 8.501477545726688e-07, + "loss": 0.0, + "step": 2763 + }, + { + "epoch": 2.452227887386389, + "grad_norm": 0.016258642077445984, + "learning_rate": 8.474791246979436e-07, + "loss": 0.0001, + "step": 2764 + }, + { + "epoch": 2.4531146087342055, + "grad_norm": 0.005662992130964994, + "learning_rate": 8.448143019652199e-07, + "loss": 0.0, + "step": 2765 + }, + { + "epoch": 2.4540013300820216, + "grad_norm": 0.006093035452067852, + "learning_rate": 8.421532888176808e-07, + "loss": 0.0, + "step": 2766 + }, + { + "epoch": 2.454888051429838, + "grad_norm": 0.38589486479759216, + "learning_rate": 8.394960876950109e-07, + "loss": 0.003, + "step": 2767 + }, + { + "epoch": 2.4557747727776547, + "grad_norm": 0.007774090860038996, + "learning_rate": 8.368427010334052e-07, + "loss": 0.0, + "step": 2768 + }, + { + "epoch": 2.456661494125471, + "grad_norm": 0.00017777753237169236, + "learning_rate": 8.341931312655582e-07, + "loss": 0.0, + "step": 2769 + }, + { + "epoch": 2.4575482154732873, + "grad_norm": 0.04939320310950279, + "learning_rate": 8.315473808206676e-07, + "loss": 0.0001, + "step": 2770 + }, + { + "epoch": 2.458434936821104, + "grad_norm": 0.0007739551365375519, + "learning_rate": 8.289054521244266e-07, + "loss": 0.0, + "step": 2771 + }, + { + "epoch": 2.4593216581689203, + "grad_norm": 0.015925316140055656, + "learning_rate": 8.262673475990285e-07, + "loss": 0.0001, + "step": 2772 + }, + { + "epoch": 2.460208379516737, + "grad_norm": 0.1478816717863083, + "learning_rate": 8.236330696631545e-07, + "loss": 0.0006, + "step": 2773 + }, + { + "epoch": 2.4610951008645534, + "grad_norm": 0.009654730558395386, + "learning_rate": 8.210026207319827e-07, + "loss": 0.0, + "step": 2774 + }, + { + "epoch": 2.46198182221237, + "grad_norm": 0.018739847466349602, + "learning_rate": 8.183760032171789e-07, + "loss": 0.0001, + "step": 2775 + }, + { + "epoch": 2.462868543560186, + "grad_norm": 0.011451950296759605, + "learning_rate": 8.157532195268963e-07, + "loss": 0.0, + "step": 2776 + }, + { + "epoch": 2.4637552649080026, + "grad_norm": 0.10962649434804916, + "learning_rate": 8.131342720657737e-07, + "loss": 0.0005, + "step": 2777 + }, + { + "epoch": 2.464641986255819, + "grad_norm": 0.01172646600753069, + "learning_rate": 8.105191632349291e-07, + "loss": 0.0, + "step": 2778 + }, + { + "epoch": 2.4655287076036356, + "grad_norm": 0.0004526283300947398, + "learning_rate": 8.07907895431968e-07, + "loss": 0.0, + "step": 2779 + }, + { + "epoch": 2.466415428951452, + "grad_norm": 0.012956739403307438, + "learning_rate": 8.053004710509676e-07, + "loss": 0.0001, + "step": 2780 + }, + { + "epoch": 2.4673021502992682, + "grad_norm": 0.0058890231885015965, + "learning_rate": 8.026968924824868e-07, + "loss": 0.0, + "step": 2781 + }, + { + "epoch": 2.468188871647085, + "grad_norm": 0.11938003450632095, + "learning_rate": 8.000971621135512e-07, + "loss": 0.0004, + "step": 2782 + }, + { + "epoch": 2.4690755929949013, + "grad_norm": 0.0003406075993552804, + "learning_rate": 7.97501282327669e-07, + "loss": 0.0, + "step": 2783 + }, + { + "epoch": 2.469962314342718, + "grad_norm": 0.0017089827451854944, + "learning_rate": 7.949092555048077e-07, + "loss": 0.0, + "step": 2784 + }, + { + "epoch": 2.4708490356905344, + "grad_norm": 0.03521645814180374, + "learning_rate": 7.923210840214085e-07, + "loss": 0.0001, + "step": 2785 + }, + { + "epoch": 2.471735757038351, + "grad_norm": 0.0007434281869791448, + "learning_rate": 7.897367702503755e-07, + "loss": 0.0, + "step": 2786 + }, + { + "epoch": 2.472622478386167, + "grad_norm": 0.0026796984020620584, + "learning_rate": 7.871563165610779e-07, + "loss": 0.0, + "step": 2787 + }, + { + "epoch": 2.4735091997339835, + "grad_norm": 0.0020774805452674627, + "learning_rate": 7.845797253193427e-07, + "loss": 0.0, + "step": 2788 + }, + { + "epoch": 2.4743959210818, + "grad_norm": 0.015672534704208374, + "learning_rate": 7.820069988874585e-07, + "loss": 0.0, + "step": 2789 + }, + { + "epoch": 2.4752826424296166, + "grad_norm": 0.014547284692525864, + "learning_rate": 7.79438139624169e-07, + "loss": 0.0001, + "step": 2790 + }, + { + "epoch": 2.476169363777433, + "grad_norm": 0.15646186470985413, + "learning_rate": 7.768731498846749e-07, + "loss": 0.0006, + "step": 2791 + }, + { + "epoch": 2.477056085125249, + "grad_norm": 0.0015324123669415712, + "learning_rate": 7.743120320206232e-07, + "loss": 0.0, + "step": 2792 + }, + { + "epoch": 2.4779428064730658, + "grad_norm": 0.004034361802041531, + "learning_rate": 7.717547883801197e-07, + "loss": 0.0, + "step": 2793 + }, + { + "epoch": 2.4788295278208823, + "grad_norm": 0.000717060174793005, + "learning_rate": 7.692014213077109e-07, + "loss": 0.0, + "step": 2794 + }, + { + "epoch": 2.479716249168699, + "grad_norm": 0.060480207204818726, + "learning_rate": 7.666519331443928e-07, + "loss": 0.0002, + "step": 2795 + }, + { + "epoch": 2.4806029705165153, + "grad_norm": 0.002845141338184476, + "learning_rate": 7.641063262276039e-07, + "loss": 0.0, + "step": 2796 + }, + { + "epoch": 2.481489691864332, + "grad_norm": 0.11835973709821701, + "learning_rate": 7.615646028912243e-07, + "loss": 0.0002, + "step": 2797 + }, + { + "epoch": 2.482376413212148, + "grad_norm": 0.04561325162649155, + "learning_rate": 7.590267654655758e-07, + "loss": 0.0001, + "step": 2798 + }, + { + "epoch": 2.4832631345599645, + "grad_norm": 0.09880010783672333, + "learning_rate": 7.564928162774127e-07, + "loss": 0.0003, + "step": 2799 + }, + { + "epoch": 2.484149855907781, + "grad_norm": 0.4014248847961426, + "learning_rate": 7.539627576499287e-07, + "loss": 0.0002, + "step": 2800 + }, + { + "epoch": 2.4850365772555976, + "grad_norm": 0.00363646587356925, + "learning_rate": 7.514365919027483e-07, + "loss": 0.0, + "step": 2801 + }, + { + "epoch": 2.4859232986034137, + "grad_norm": 0.6536305546760559, + "learning_rate": 7.489143213519301e-07, + "loss": 0.0043, + "step": 2802 + }, + { + "epoch": 2.48681001995123, + "grad_norm": 0.00101925665512681, + "learning_rate": 7.463959483099547e-07, + "loss": 0.0, + "step": 2803 + }, + { + "epoch": 2.4876967412990467, + "grad_norm": 0.0003748614399228245, + "learning_rate": 7.438814750857387e-07, + "loss": 0.0, + "step": 2804 + }, + { + "epoch": 2.4885834626468633, + "grad_norm": 0.1380384862422943, + "learning_rate": 7.41370903984615e-07, + "loss": 0.0001, + "step": 2805 + }, + { + "epoch": 2.48947018399468, + "grad_norm": 0.028476646170020103, + "learning_rate": 7.388642373083438e-07, + "loss": 0.0001, + "step": 2806 + }, + { + "epoch": 2.4903569053424963, + "grad_norm": 0.01267907302826643, + "learning_rate": 7.363614773551015e-07, + "loss": 0.0, + "step": 2807 + }, + { + "epoch": 2.4912436266903124, + "grad_norm": 0.005699229426681995, + "learning_rate": 7.338626264194893e-07, + "loss": 0.0, + "step": 2808 + }, + { + "epoch": 2.492130348038129, + "grad_norm": 0.0008996912511065602, + "learning_rate": 7.313676867925168e-07, + "loss": 0.0, + "step": 2809 + }, + { + "epoch": 2.4930170693859455, + "grad_norm": 0.0016446348745375872, + "learning_rate": 7.288766607616132e-07, + "loss": 0.0, + "step": 2810 + }, + { + "epoch": 2.493903790733762, + "grad_norm": 0.0015573484124615788, + "learning_rate": 7.263895506106167e-07, + "loss": 0.0, + "step": 2811 + }, + { + "epoch": 2.4947905120815785, + "grad_norm": 0.7243852615356445, + "learning_rate": 7.239063586197786e-07, + "loss": 0.0028, + "step": 2812 + }, + { + "epoch": 2.4956772334293946, + "grad_norm": 0.030200662091374397, + "learning_rate": 7.21427087065752e-07, + "loss": 0.0001, + "step": 2813 + }, + { + "epoch": 2.496563954777211, + "grad_norm": 0.0018416463863104582, + "learning_rate": 7.18951738221601e-07, + "loss": 0.0, + "step": 2814 + }, + { + "epoch": 2.4974506761250277, + "grad_norm": 0.000541276705916971, + "learning_rate": 7.164803143567922e-07, + "loss": 0.0, + "step": 2815 + }, + { + "epoch": 2.4983373974728442, + "grad_norm": 0.0023916105274111032, + "learning_rate": 7.14012817737193e-07, + "loss": 0.0, + "step": 2816 + }, + { + "epoch": 2.4992241188206608, + "grad_norm": 0.0036319156643003225, + "learning_rate": 7.11549250625067e-07, + "loss": 0.0, + "step": 2817 + }, + { + "epoch": 2.5001108401684773, + "grad_norm": 0.020262181758880615, + "learning_rate": 7.090896152790838e-07, + "loss": 0.0001, + "step": 2818 + }, + { + "epoch": 2.5009975615162934, + "grad_norm": 0.0017561853164806962, + "learning_rate": 7.066339139542983e-07, + "loss": 0.0, + "step": 2819 + }, + { + "epoch": 2.50188428286411, + "grad_norm": 0.0007896814495325089, + "learning_rate": 7.041821489021639e-07, + "loss": 0.0, + "step": 2820 + }, + { + "epoch": 2.50188428286411, + "eval_loss": 0.02093465067446232, + "eval_runtime": 60.5435, + "eval_samples_per_second": 3.171, + "eval_steps_per_second": 0.793, + "step": 2820 + }, + { + "epoch": 2.5027710042119264, + "grad_norm": 0.2156534045934677, + "learning_rate": 7.017343223705253e-07, + "loss": 0.0009, + "step": 2821 + }, + { + "epoch": 2.503657725559743, + "grad_norm": 0.0018699497450143099, + "learning_rate": 6.992904366036152e-07, + "loss": 0.0, + "step": 2822 + }, + { + "epoch": 2.504544446907559, + "grad_norm": 0.012214919552206993, + "learning_rate": 6.968504938420534e-07, + "loss": 0.0, + "step": 2823 + }, + { + "epoch": 2.5054311682553756, + "grad_norm": 0.0025382922030985355, + "learning_rate": 6.944144963228433e-07, + "loss": 0.0, + "step": 2824 + }, + { + "epoch": 2.506317889603192, + "grad_norm": 0.004923071712255478, + "learning_rate": 6.919824462793728e-07, + "loss": 0.0, + "step": 2825 + }, + { + "epoch": 2.5072046109510087, + "grad_norm": 0.06211470812559128, + "learning_rate": 6.895543459414112e-07, + "loss": 0.0003, + "step": 2826 + }, + { + "epoch": 2.508091332298825, + "grad_norm": 0.0008407292189076543, + "learning_rate": 6.871301975351063e-07, + "loss": 0.0, + "step": 2827 + }, + { + "epoch": 2.5089780536466417, + "grad_norm": 0.007816070690751076, + "learning_rate": 6.847100032829784e-07, + "loss": 0.0, + "step": 2828 + }, + { + "epoch": 2.5098647749944583, + "grad_norm": 0.06127519533038139, + "learning_rate": 6.822937654039313e-07, + "loss": 0.0001, + "step": 2829 + }, + { + "epoch": 2.5107514963422743, + "grad_norm": 0.010569370351731777, + "learning_rate": 6.79881486113233e-07, + "loss": 0.0, + "step": 2830 + }, + { + "epoch": 2.511638217690091, + "grad_norm": 0.000980584416538477, + "learning_rate": 6.774731676225261e-07, + "loss": 0.0, + "step": 2831 + }, + { + "epoch": 2.5125249390379074, + "grad_norm": 0.0009631279390305281, + "learning_rate": 6.750688121398219e-07, + "loss": 0.0, + "step": 2832 + }, + { + "epoch": 2.513411660385724, + "grad_norm": 0.002405727282166481, + "learning_rate": 6.726684218694984e-07, + "loss": 0.0, + "step": 2833 + }, + { + "epoch": 2.51429838173354, + "grad_norm": 0.0005489909672178328, + "learning_rate": 6.702719990122958e-07, + "loss": 0.0, + "step": 2834 + }, + { + "epoch": 2.5151851030813566, + "grad_norm": 0.005501510575413704, + "learning_rate": 6.678795457653186e-07, + "loss": 0.0, + "step": 2835 + }, + { + "epoch": 2.516071824429173, + "grad_norm": 0.003498939797282219, + "learning_rate": 6.65491064322033e-07, + "loss": 0.0, + "step": 2836 + }, + { + "epoch": 2.5169585457769896, + "grad_norm": 0.12416200339794159, + "learning_rate": 6.631065568722633e-07, + "loss": 0.0002, + "step": 2837 + }, + { + "epoch": 2.517845267124806, + "grad_norm": 0.0013328570639714599, + "learning_rate": 6.607260256021864e-07, + "loss": 0.0, + "step": 2838 + }, + { + "epoch": 2.5187319884726227, + "grad_norm": 0.00046582092181779444, + "learning_rate": 6.583494726943401e-07, + "loss": 0.0, + "step": 2839 + }, + { + "epoch": 2.519618709820439, + "grad_norm": 0.13117870688438416, + "learning_rate": 6.559769003276101e-07, + "loss": 0.0002, + "step": 2840 + }, + { + "epoch": 2.5205054311682553, + "grad_norm": 0.006309604272246361, + "learning_rate": 6.536083106772351e-07, + "loss": 0.0, + "step": 2841 + }, + { + "epoch": 2.521392152516072, + "grad_norm": 0.0005661431350745261, + "learning_rate": 6.512437059147986e-07, + "loss": 0.0, + "step": 2842 + }, + { + "epoch": 2.5222788738638884, + "grad_norm": 0.0011204613838344812, + "learning_rate": 6.48883088208237e-07, + "loss": 0.0, + "step": 2843 + }, + { + "epoch": 2.5231655952117045, + "grad_norm": 0.0654761791229248, + "learning_rate": 6.465264597218252e-07, + "loss": 0.0002, + "step": 2844 + }, + { + "epoch": 2.524052316559521, + "grad_norm": 0.021184800192713737, + "learning_rate": 6.441738226161842e-07, + "loss": 0.0, + "step": 2845 + }, + { + "epoch": 2.5249390379073375, + "grad_norm": 0.3001684546470642, + "learning_rate": 6.418251790482738e-07, + "loss": 0.0029, + "step": 2846 + }, + { + "epoch": 2.525825759255154, + "grad_norm": 0.09699290245771408, + "learning_rate": 6.394805311713936e-07, + "loss": 0.0003, + "step": 2847 + }, + { + "epoch": 2.5267124806029706, + "grad_norm": 0.021082287654280663, + "learning_rate": 6.371398811351804e-07, + "loss": 0.0001, + "step": 2848 + }, + { + "epoch": 2.527599201950787, + "grad_norm": 0.015293094329535961, + "learning_rate": 6.348032310856022e-07, + "loss": 0.0, + "step": 2849 + }, + { + "epoch": 2.5284859232986037, + "grad_norm": 0.00025907374219968915, + "learning_rate": 6.324705831649636e-07, + "loss": 0.0, + "step": 2850 + }, + { + "epoch": 2.5293726446464198, + "grad_norm": 0.003451921045780182, + "learning_rate": 6.301419395118985e-07, + "loss": 0.0, + "step": 2851 + }, + { + "epoch": 2.5302593659942363, + "grad_norm": 0.2557615041732788, + "learning_rate": 6.278173022613693e-07, + "loss": 0.0012, + "step": 2852 + }, + { + "epoch": 2.531146087342053, + "grad_norm": 0.020587336272001266, + "learning_rate": 6.25496673544666e-07, + "loss": 0.0001, + "step": 2853 + }, + { + "epoch": 2.5320328086898694, + "grad_norm": 0.004580834414809942, + "learning_rate": 6.231800554894029e-07, + "loss": 0.0, + "step": 2854 + }, + { + "epoch": 2.5329195300376854, + "grad_norm": 0.004284840542823076, + "learning_rate": 6.208674502195167e-07, + "loss": 0.0, + "step": 2855 + }, + { + "epoch": 2.533806251385502, + "grad_norm": 0.0342896431684494, + "learning_rate": 6.185588598552655e-07, + "loss": 0.0001, + "step": 2856 + }, + { + "epoch": 2.5346929727333185, + "grad_norm": 0.002259077737107873, + "learning_rate": 6.162542865132277e-07, + "loss": 0.0, + "step": 2857 + }, + { + "epoch": 2.535579694081135, + "grad_norm": 0.7990791201591492, + "learning_rate": 6.139537323062972e-07, + "loss": 0.0023, + "step": 2858 + }, + { + "epoch": 2.5364664154289516, + "grad_norm": 0.0004105519037693739, + "learning_rate": 6.116571993436815e-07, + "loss": 0.0, + "step": 2859 + }, + { + "epoch": 2.537353136776768, + "grad_norm": 0.001817844226025045, + "learning_rate": 6.093646897309074e-07, + "loss": 0.0, + "step": 2860 + }, + { + "epoch": 2.538239858124584, + "grad_norm": 0.05438079684972763, + "learning_rate": 6.070762055698054e-07, + "loss": 0.0001, + "step": 2861 + }, + { + "epoch": 2.5391265794724007, + "grad_norm": 0.03450392559170723, + "learning_rate": 6.047917489585208e-07, + "loss": 0.0001, + "step": 2862 + }, + { + "epoch": 2.5400133008202173, + "grad_norm": 0.21090054512023926, + "learning_rate": 6.025113219915024e-07, + "loss": 0.0005, + "step": 2863 + }, + { + "epoch": 2.540900022168034, + "grad_norm": 0.022419922053813934, + "learning_rate": 6.002349267595092e-07, + "loss": 0.0, + "step": 2864 + }, + { + "epoch": 2.54178674351585, + "grad_norm": 0.0006861055735498667, + "learning_rate": 5.979625653495996e-07, + "loss": 0.0, + "step": 2865 + }, + { + "epoch": 2.5426734648636664, + "grad_norm": 0.0008977435645647347, + "learning_rate": 5.956942398451349e-07, + "loss": 0.0, + "step": 2866 + }, + { + "epoch": 2.543560186211483, + "grad_norm": 0.0007839131285436451, + "learning_rate": 5.934299523257769e-07, + "loss": 0.0, + "step": 2867 + }, + { + "epoch": 2.5444469075592995, + "grad_norm": 0.020759517326951027, + "learning_rate": 5.911697048674864e-07, + "loss": 0.0001, + "step": 2868 + }, + { + "epoch": 2.545333628907116, + "grad_norm": 0.028356509283185005, + "learning_rate": 5.889134995425155e-07, + "loss": 0.0001, + "step": 2869 + }, + { + "epoch": 2.5462203502549325, + "grad_norm": 0.001867869053967297, + "learning_rate": 5.866613384194153e-07, + "loss": 0.0, + "step": 2870 + }, + { + "epoch": 2.547107071602749, + "grad_norm": 0.029476385563611984, + "learning_rate": 5.844132235630273e-07, + "loss": 0.0002, + "step": 2871 + }, + { + "epoch": 2.547993792950565, + "grad_norm": 0.014416356571018696, + "learning_rate": 5.821691570344829e-07, + "loss": 0.0, + "step": 2872 + }, + { + "epoch": 2.5488805142983817, + "grad_norm": 0.0014337707543745637, + "learning_rate": 5.799291408912022e-07, + "loss": 0.0, + "step": 2873 + }, + { + "epoch": 2.5497672356461982, + "grad_norm": 0.029101770371198654, + "learning_rate": 5.776931771868932e-07, + "loss": 0.0001, + "step": 2874 + }, + { + "epoch": 2.5506539569940148, + "grad_norm": 0.2410978376865387, + "learning_rate": 5.754612679715443e-07, + "loss": 0.0012, + "step": 2875 + }, + { + "epoch": 2.551540678341831, + "grad_norm": 0.018282808363437653, + "learning_rate": 5.732334152914315e-07, + "loss": 0.0, + "step": 2876 + }, + { + "epoch": 2.5524273996896474, + "grad_norm": 0.00830524880439043, + "learning_rate": 5.710096211891087e-07, + "loss": 0.0, + "step": 2877 + }, + { + "epoch": 2.553314121037464, + "grad_norm": 0.2229091078042984, + "learning_rate": 5.687898877034104e-07, + "loss": 0.0003, + "step": 2878 + }, + { + "epoch": 2.5542008423852804, + "grad_norm": 0.12925313413143158, + "learning_rate": 5.665742168694483e-07, + "loss": 0.0003, + "step": 2879 + }, + { + "epoch": 2.555087563733097, + "grad_norm": 0.007797439582645893, + "learning_rate": 5.643626107186051e-07, + "loss": 0.0, + "step": 2880 + }, + { + "epoch": 2.5559742850809135, + "grad_norm": 0.12223513424396515, + "learning_rate": 5.621550712785445e-07, + "loss": 0.0001, + "step": 2881 + }, + { + "epoch": 2.55686100642873, + "grad_norm": 0.0494576133787632, + "learning_rate": 5.599516005731953e-07, + "loss": 0.0002, + "step": 2882 + }, + { + "epoch": 2.557747727776546, + "grad_norm": 0.010439654812216759, + "learning_rate": 5.577522006227598e-07, + "loss": 0.0, + "step": 2883 + }, + { + "epoch": 2.5586344491243627, + "grad_norm": 0.000774744083173573, + "learning_rate": 5.555568734437039e-07, + "loss": 0.0, + "step": 2884 + }, + { + "epoch": 2.559521170472179, + "grad_norm": 0.0021131117828190327, + "learning_rate": 5.533656210487654e-07, + "loss": 0.0, + "step": 2885 + }, + { + "epoch": 2.5604078918199957, + "grad_norm": 0.0041869510896503925, + "learning_rate": 5.511784454469404e-07, + "loss": 0.0, + "step": 2886 + }, + { + "epoch": 2.561294613167812, + "grad_norm": 0.010568622499704361, + "learning_rate": 5.489953486434913e-07, + "loss": 0.0, + "step": 2887 + }, + { + "epoch": 2.5621813345156284, + "grad_norm": 0.0009484928450547159, + "learning_rate": 5.46816332639939e-07, + "loss": 0.0, + "step": 2888 + }, + { + "epoch": 2.563068055863445, + "grad_norm": 0.0006807691534049809, + "learning_rate": 5.446413994340649e-07, + "loss": 0.0, + "step": 2889 + }, + { + "epoch": 2.5639547772112614, + "grad_norm": 0.7828634977340698, + "learning_rate": 5.424705510199036e-07, + "loss": 0.0015, + "step": 2890 + }, + { + "epoch": 2.564841498559078, + "grad_norm": 0.04002891480922699, + "learning_rate": 5.403037893877478e-07, + "loss": 0.0001, + "step": 2891 + }, + { + "epoch": 2.5657282199068945, + "grad_norm": 0.2507193982601166, + "learning_rate": 5.381411165241429e-07, + "loss": 0.0018, + "step": 2892 + }, + { + "epoch": 2.5666149412547106, + "grad_norm": 0.0006097395671531558, + "learning_rate": 5.359825344118858e-07, + "loss": 0.0, + "step": 2893 + }, + { + "epoch": 2.567501662602527, + "grad_norm": 0.048260822892189026, + "learning_rate": 5.338280450300187e-07, + "loss": 0.0001, + "step": 2894 + }, + { + "epoch": 2.5683883839503436, + "grad_norm": 0.029657047241926193, + "learning_rate": 5.316776503538401e-07, + "loss": 0.0001, + "step": 2895 + }, + { + "epoch": 2.56927510529816, + "grad_norm": 0.010401280596852303, + "learning_rate": 5.295313523548851e-07, + "loss": 0.0, + "step": 2896 + }, + { + "epoch": 2.5701618266459763, + "grad_norm": 0.0022809342481195927, + "learning_rate": 5.27389153000939e-07, + "loss": 0.0, + "step": 2897 + }, + { + "epoch": 2.571048547993793, + "grad_norm": 0.005268683657050133, + "learning_rate": 5.252510542560268e-07, + "loss": 0.0, + "step": 2898 + }, + { + "epoch": 2.5719352693416093, + "grad_norm": 0.0015263669192790985, + "learning_rate": 5.231170580804146e-07, + "loss": 0.0, + "step": 2899 + }, + { + "epoch": 2.572821990689426, + "grad_norm": 0.0002925303124357015, + "learning_rate": 5.209871664306076e-07, + "loss": 0.0, + "step": 2900 + }, + { + "epoch": 2.5737087120372424, + "grad_norm": 0.4949420094490051, + "learning_rate": 5.188613812593463e-07, + "loss": 0.0053, + "step": 2901 + }, + { + "epoch": 2.574595433385059, + "grad_norm": 0.005495188757777214, + "learning_rate": 5.167397045156075e-07, + "loss": 0.0, + "step": 2902 + }, + { + "epoch": 2.5754821547328755, + "grad_norm": 0.001007727812975645, + "learning_rate": 5.14622138144602e-07, + "loss": 0.0, + "step": 2903 + }, + { + "epoch": 2.5763688760806915, + "grad_norm": 0.003264603205025196, + "learning_rate": 5.125086840877707e-07, + "loss": 0.0, + "step": 2904 + }, + { + "epoch": 2.577255597428508, + "grad_norm": 0.001805190579034388, + "learning_rate": 5.103993442827832e-07, + "loss": 0.0, + "step": 2905 + }, + { + "epoch": 2.5781423187763246, + "grad_norm": 0.3015930950641632, + "learning_rate": 5.082941206635417e-07, + "loss": 0.0008, + "step": 2906 + }, + { + "epoch": 2.579029040124141, + "grad_norm": 0.14610321819782257, + "learning_rate": 5.06193015160169e-07, + "loss": 0.0002, + "step": 2907 + }, + { + "epoch": 2.5799157614719572, + "grad_norm": 0.003515577409416437, + "learning_rate": 5.04096029699016e-07, + "loss": 0.0, + "step": 2908 + }, + { + "epoch": 2.5808024828197738, + "grad_norm": 0.0014688996598124504, + "learning_rate": 5.020031662026526e-07, + "loss": 0.0, + "step": 2909 + }, + { + "epoch": 2.5816892041675903, + "grad_norm": 0.006348750554025173, + "learning_rate": 4.999144265898753e-07, + "loss": 0.0, + "step": 2910 + }, + { + "epoch": 2.582575925515407, + "grad_norm": 0.6952155232429504, + "learning_rate": 4.978298127756937e-07, + "loss": 0.001, + "step": 2911 + }, + { + "epoch": 2.5834626468632234, + "grad_norm": 0.004262842237949371, + "learning_rate": 4.957493266713387e-07, + "loss": 0.0, + "step": 2912 + }, + { + "epoch": 2.58434936821104, + "grad_norm": 0.0013588215224444866, + "learning_rate": 4.936729701842552e-07, + "loss": 0.0, + "step": 2913 + }, + { + "epoch": 2.585236089558856, + "grad_norm": 0.008707460016012192, + "learning_rate": 4.916007452181032e-07, + "loss": 0.0, + "step": 2914 + }, + { + "epoch": 2.5861228109066725, + "grad_norm": 0.20416337251663208, + "learning_rate": 4.895326536727518e-07, + "loss": 0.0003, + "step": 2915 + }, + { + "epoch": 2.587009532254489, + "grad_norm": 0.05300324037671089, + "learning_rate": 4.874686974442839e-07, + "loss": 0.0002, + "step": 2916 + }, + { + "epoch": 2.5878962536023056, + "grad_norm": 0.8284305334091187, + "learning_rate": 4.854088784249889e-07, + "loss": 0.0077, + "step": 2917 + }, + { + "epoch": 2.5887829749501217, + "grad_norm": 0.00847573857754469, + "learning_rate": 4.833531985033657e-07, + "loss": 0.0001, + "step": 2918 + }, + { + "epoch": 2.589669696297938, + "grad_norm": 0.05608953908085823, + "learning_rate": 4.813016595641135e-07, + "loss": 0.0001, + "step": 2919 + }, + { + "epoch": 2.5905564176457547, + "grad_norm": 0.003506513312458992, + "learning_rate": 4.792542634881414e-07, + "loss": 0.0, + "step": 2920 + }, + { + "epoch": 2.5914431389935713, + "grad_norm": 0.030793076381087303, + "learning_rate": 4.772110121525548e-07, + "loss": 0.0001, + "step": 2921 + }, + { + "epoch": 2.592329860341388, + "grad_norm": 0.1802815943956375, + "learning_rate": 4.751719074306604e-07, + "loss": 0.001, + "step": 2922 + }, + { + "epoch": 2.5932165816892043, + "grad_norm": 0.0012924959883093834, + "learning_rate": 4.731369511919653e-07, + "loss": 0.0, + "step": 2923 + }, + { + "epoch": 2.594103303037021, + "grad_norm": 0.002344125881791115, + "learning_rate": 4.7110614530217147e-07, + "loss": 0.0, + "step": 2924 + }, + { + "epoch": 2.594990024384837, + "grad_norm": 0.018599161878228188, + "learning_rate": 4.6907949162317667e-07, + "loss": 0.0001, + "step": 2925 + }, + { + "epoch": 2.5958767457326535, + "grad_norm": 0.0013417156878858805, + "learning_rate": 4.670569920130691e-07, + "loss": 0.0, + "step": 2926 + }, + { + "epoch": 2.59676346708047, + "grad_norm": 0.29868119955062866, + "learning_rate": 4.65038648326131e-07, + "loss": 0.0029, + "step": 2927 + }, + { + "epoch": 2.5976501884282865, + "grad_norm": 0.030218416824936867, + "learning_rate": 4.630244624128349e-07, + "loss": 0.0001, + "step": 2928 + }, + { + "epoch": 2.5985369097761026, + "grad_norm": 0.2323320060968399, + "learning_rate": 4.610144361198399e-07, + "loss": 0.0009, + "step": 2929 + }, + { + "epoch": 2.599423631123919, + "grad_norm": 0.0015396233648061752, + "learning_rate": 4.5900857128998997e-07, + "loss": 0.0, + "step": 2930 + }, + { + "epoch": 2.6003103524717357, + "grad_norm": 0.3064575493335724, + "learning_rate": 4.5700686976231865e-07, + "loss": 0.0006, + "step": 2931 + }, + { + "epoch": 2.6011970738195522, + "grad_norm": 0.0025372328236699104, + "learning_rate": 4.550093333720368e-07, + "loss": 0.0, + "step": 2932 + }, + { + "epoch": 2.6020837951673688, + "grad_norm": 0.0003127675736322999, + "learning_rate": 4.5301596395053993e-07, + "loss": 0.0, + "step": 2933 + }, + { + "epoch": 2.6029705165151853, + "grad_norm": 0.0011001265374943614, + "learning_rate": 4.5102676332540284e-07, + "loss": 0.0, + "step": 2934 + }, + { + "epoch": 2.603857237863002, + "grad_norm": 0.40902015566825867, + "learning_rate": 4.490417333203778e-07, + "loss": 0.0028, + "step": 2935 + }, + { + "epoch": 2.604743959210818, + "grad_norm": 0.06541017442941666, + "learning_rate": 4.470608757553918e-07, + "loss": 0.0002, + "step": 2936 + }, + { + "epoch": 2.6056306805586344, + "grad_norm": 0.08096663653850555, + "learning_rate": 4.4508419244654934e-07, + "loss": 0.0004, + "step": 2937 + }, + { + "epoch": 2.606517401906451, + "grad_norm": 0.11979025602340698, + "learning_rate": 4.431116852061257e-07, + "loss": 0.0006, + "step": 2938 + }, + { + "epoch": 2.607404123254267, + "grad_norm": 0.00492709269747138, + "learning_rate": 4.4114335584256986e-07, + "loss": 0.0, + "step": 2939 + }, + { + "epoch": 2.6082908446020836, + "grad_norm": 0.004521166905760765, + "learning_rate": 4.3917920616049546e-07, + "loss": 0.0, + "step": 2940 + }, + { + "epoch": 2.6091775659499, + "grad_norm": 0.0010443405481055379, + "learning_rate": 4.37219237960691e-07, + "loss": 0.0, + "step": 2941 + }, + { + "epoch": 2.6100642872977167, + "grad_norm": 0.0015108479419723153, + "learning_rate": 4.352634530401051e-07, + "loss": 0.0, + "step": 2942 + }, + { + "epoch": 2.610951008645533, + "grad_norm": 0.20358669757843018, + "learning_rate": 4.3331185319185567e-07, + "loss": 0.0006, + "step": 2943 + }, + { + "epoch": 2.6118377299933497, + "grad_norm": 0.1645130217075348, + "learning_rate": 4.313644402052186e-07, + "loss": 0.0011, + "step": 2944 + }, + { + "epoch": 2.6127244513411663, + "grad_norm": 0.002132191089913249, + "learning_rate": 4.2942121586563745e-07, + "loss": 0.0, + "step": 2945 + }, + { + "epoch": 2.6136111726889824, + "grad_norm": 0.005285217426717281, + "learning_rate": 4.274821819547098e-07, + "loss": 0.0, + "step": 2946 + }, + { + "epoch": 2.614497894036799, + "grad_norm": 0.0008985325694084167, + "learning_rate": 4.255473402501947e-07, + "loss": 0.0, + "step": 2947 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 0.0016815599519759417, + "learning_rate": 4.23616692526006e-07, + "loss": 0.0, + "step": 2948 + }, + { + "epoch": 2.616271336732432, + "grad_norm": 0.0033405658323317766, + "learning_rate": 4.2169024055221385e-07, + "loss": 0.0, + "step": 2949 + }, + { + "epoch": 2.617158058080248, + "grad_norm": 0.11762901395559311, + "learning_rate": 4.1976798609504155e-07, + "loss": 0.0003, + "step": 2950 + }, + { + "epoch": 2.6180447794280646, + "grad_norm": 0.0011727182427421212, + "learning_rate": 4.1784993091686156e-07, + "loss": 0.0, + "step": 2951 + }, + { + "epoch": 2.618931500775881, + "grad_norm": 0.16032883524894714, + "learning_rate": 4.159360767761989e-07, + "loss": 0.0004, + "step": 2952 + }, + { + "epoch": 2.6198182221236976, + "grad_norm": 0.0036196603905409575, + "learning_rate": 4.1402642542772544e-07, + "loss": 0.0, + "step": 2953 + }, + { + "epoch": 2.620704943471514, + "grad_norm": 0.0020090683829039335, + "learning_rate": 4.1212097862226186e-07, + "loss": 0.0, + "step": 2954 + }, + { + "epoch": 2.6215916648193307, + "grad_norm": 0.03135655075311661, + "learning_rate": 4.1021973810677187e-07, + "loss": 0.0001, + "step": 2955 + }, + { + "epoch": 2.6224783861671472, + "grad_norm": 0.10065038502216339, + "learning_rate": 4.0832270562436436e-07, + "loss": 0.0004, + "step": 2956 + }, + { + "epoch": 2.6233651075149633, + "grad_norm": 0.23414331674575806, + "learning_rate": 4.0642988291428764e-07, + "loss": 0.0013, + "step": 2957 + }, + { + "epoch": 2.62425182886278, + "grad_norm": 0.039282117038965225, + "learning_rate": 4.0454127171193347e-07, + "loss": 0.0001, + "step": 2958 + }, + { + "epoch": 2.6251385502105964, + "grad_norm": 0.0033170965034514666, + "learning_rate": 4.0265687374883014e-07, + "loss": 0.0, + "step": 2959 + }, + { + "epoch": 2.626025271558413, + "grad_norm": 0.06266362965106964, + "learning_rate": 4.0077669075264555e-07, + "loss": 0.0003, + "step": 2960 + }, + { + "epoch": 2.626911992906229, + "grad_norm": 0.002013074466958642, + "learning_rate": 3.9890072444717933e-07, + "loss": 0.0, + "step": 2961 + }, + { + "epoch": 2.6277987142540455, + "grad_norm": 0.059029195457696915, + "learning_rate": 3.970289765523699e-07, + "loss": 0.0001, + "step": 2962 + }, + { + "epoch": 2.628685435601862, + "grad_norm": 0.005892843008041382, + "learning_rate": 3.951614487842842e-07, + "loss": 0.0, + "step": 2963 + }, + { + "epoch": 2.6295721569496786, + "grad_norm": 0.26164448261260986, + "learning_rate": 3.932981428551225e-07, + "loss": 0.0069, + "step": 2964 + }, + { + "epoch": 2.630458878297495, + "grad_norm": 0.4535243511199951, + "learning_rate": 3.914390604732116e-07, + "loss": 0.0009, + "step": 2965 + }, + { + "epoch": 2.6313455996453117, + "grad_norm": 0.0008455055649392307, + "learning_rate": 3.895842033430103e-07, + "loss": 0.0, + "step": 2966 + }, + { + "epoch": 2.6322323209931278, + "grad_norm": 0.0007907512481324375, + "learning_rate": 3.877335731650994e-07, + "loss": 0.0, + "step": 2967 + }, + { + "epoch": 2.6331190423409443, + "grad_norm": 0.0056696608662605286, + "learning_rate": 3.8588717163618673e-07, + "loss": 0.0, + "step": 2968 + }, + { + "epoch": 2.634005763688761, + "grad_norm": 0.0016626459546387196, + "learning_rate": 3.8404500044910197e-07, + "loss": 0.0, + "step": 2969 + }, + { + "epoch": 2.6348924850365774, + "grad_norm": 0.013582353480160236, + "learning_rate": 3.822070612927986e-07, + "loss": 0.0, + "step": 2970 + }, + { + "epoch": 2.6357792063843934, + "grad_norm": 0.03314371407032013, + "learning_rate": 3.803733558523459e-07, + "loss": 0.0001, + "step": 2971 + }, + { + "epoch": 2.63666592773221, + "grad_norm": 0.02767125330865383, + "learning_rate": 3.7854388580893463e-07, + "loss": 0.0, + "step": 2972 + }, + { + "epoch": 2.6375526490800265, + "grad_norm": 0.09225767105817795, + "learning_rate": 3.7671865283987254e-07, + "loss": 0.0002, + "step": 2973 + }, + { + "epoch": 2.638439370427843, + "grad_norm": 0.005085418000817299, + "learning_rate": 3.7489765861858205e-07, + "loss": 0.0, + "step": 2974 + }, + { + "epoch": 2.6393260917756596, + "grad_norm": 0.0008364695822820067, + "learning_rate": 3.7308090481459836e-07, + "loss": 0.0, + "step": 2975 + }, + { + "epoch": 2.640212813123476, + "grad_norm": 0.005674805957823992, + "learning_rate": 3.712683930935701e-07, + "loss": 0.0, + "step": 2976 + }, + { + "epoch": 2.6410995344712926, + "grad_norm": 0.021331392228603363, + "learning_rate": 3.6946012511725806e-07, + "loss": 0.0001, + "step": 2977 + }, + { + "epoch": 2.6419862558191087, + "grad_norm": 0.030648577958345413, + "learning_rate": 3.676561025435277e-07, + "loss": 0.0001, + "step": 2978 + }, + { + "epoch": 2.6428729771669253, + "grad_norm": 0.006878063082695007, + "learning_rate": 3.6585632702635653e-07, + "loss": 0.0, + "step": 2979 + }, + { + "epoch": 2.643759698514742, + "grad_norm": 0.0028057079762220383, + "learning_rate": 3.6406080021582747e-07, + "loss": 0.0, + "step": 2980 + }, + { + "epoch": 2.6446464198625583, + "grad_norm": 0.12466216087341309, + "learning_rate": 3.62269523758127e-07, + "loss": 0.0004, + "step": 2981 + }, + { + "epoch": 2.6455331412103744, + "grad_norm": 0.016507942229509354, + "learning_rate": 3.604824992955436e-07, + "loss": 0.0001, + "step": 2982 + }, + { + "epoch": 2.646419862558191, + "grad_norm": 0.004891809541732073, + "learning_rate": 3.586997284664723e-07, + "loss": 0.0, + "step": 2983 + }, + { + "epoch": 2.6473065839060075, + "grad_norm": 0.0006478436989709735, + "learning_rate": 3.569212129054017e-07, + "loss": 0.0, + "step": 2984 + }, + { + "epoch": 2.648193305253824, + "grad_norm": 0.009865147061645985, + "learning_rate": 3.551469542429248e-07, + "loss": 0.0, + "step": 2985 + }, + { + "epoch": 2.6490800266016405, + "grad_norm": 0.04954991117119789, + "learning_rate": 3.5337695410572694e-07, + "loss": 0.0001, + "step": 2986 + }, + { + "epoch": 2.649966747949457, + "grad_norm": 0.002256659558042884, + "learning_rate": 3.5161121411659395e-07, + "loss": 0.0, + "step": 2987 + }, + { + "epoch": 2.650853469297273, + "grad_norm": 0.00758238323032856, + "learning_rate": 3.4984973589440143e-07, + "loss": 0.0, + "step": 2988 + }, + { + "epoch": 2.6517401906450897, + "grad_norm": 0.017352283000946045, + "learning_rate": 3.480925210541203e-07, + "loss": 0.0001, + "step": 2989 + }, + { + "epoch": 2.6526269119929062, + "grad_norm": 0.0017505744472146034, + "learning_rate": 3.4633957120681294e-07, + "loss": 0.0, + "step": 2990 + }, + { + "epoch": 2.6535136333407228, + "grad_norm": 0.005053408443927765, + "learning_rate": 3.445908879596299e-07, + "loss": 0.0, + "step": 2991 + }, + { + "epoch": 2.654400354688539, + "grad_norm": 0.011921010911464691, + "learning_rate": 3.4284647291580984e-07, + "loss": 0.0, + "step": 2992 + }, + { + "epoch": 2.6552870760363554, + "grad_norm": 0.0022184166591614485, + "learning_rate": 3.411063276746801e-07, + "loss": 0.0, + "step": 2993 + }, + { + "epoch": 2.656173797384172, + "grad_norm": 0.001728870440274477, + "learning_rate": 3.3937045383165226e-07, + "loss": 0.0, + "step": 2994 + }, + { + "epoch": 2.6570605187319885, + "grad_norm": 0.0017493030754849315, + "learning_rate": 3.3763885297822153e-07, + "loss": 0.0, + "step": 2995 + }, + { + "epoch": 2.657947240079805, + "grad_norm": 0.050390731543302536, + "learning_rate": 3.359115267019647e-07, + "loss": 0.0002, + "step": 2996 + }, + { + "epoch": 2.6588339614276215, + "grad_norm": 0.00843139924108982, + "learning_rate": 3.341884765865433e-07, + "loss": 0.0, + "step": 2997 + }, + { + "epoch": 2.659720682775438, + "grad_norm": 0.1068384125828743, + "learning_rate": 3.324697042116931e-07, + "loss": 0.0003, + "step": 2998 + }, + { + "epoch": 2.660607404123254, + "grad_norm": 0.0007967590354382992, + "learning_rate": 3.3075521115323194e-07, + "loss": 0.0, + "step": 2999 + }, + { + "epoch": 2.6614941254710707, + "grad_norm": 0.028992870822548866, + "learning_rate": 3.290449989830519e-07, + "loss": 0.0, + "step": 3000 + }, + { + "epoch": 2.662380846818887, + "grad_norm": 0.017356855794787407, + "learning_rate": 3.2733906926912193e-07, + "loss": 0.0001, + "step": 3001 + }, + { + "epoch": 2.6632675681667037, + "grad_norm": 0.005700121633708477, + "learning_rate": 3.256374235754844e-07, + "loss": 0.0, + "step": 3002 + }, + { + "epoch": 2.66415428951452, + "grad_norm": 0.0009079916635528207, + "learning_rate": 3.239400634622519e-07, + "loss": 0.0, + "step": 3003 + }, + { + "epoch": 2.6650410108623364, + "grad_norm": 0.22687380015850067, + "learning_rate": 3.222469904856107e-07, + "loss": 0.001, + "step": 3004 + }, + { + "epoch": 2.665927732210153, + "grad_norm": 0.0053674750961363316, + "learning_rate": 3.2055820619781466e-07, + "loss": 0.0, + "step": 3005 + }, + { + "epoch": 2.6668144535579694, + "grad_norm": 0.0032461630180478096, + "learning_rate": 3.1887371214718687e-07, + "loss": 0.0, + "step": 3006 + }, + { + "epoch": 2.667701174905786, + "grad_norm": 0.07514698058366776, + "learning_rate": 3.1719350987811537e-07, + "loss": 0.0001, + "step": 3007 + }, + { + "epoch": 2.6685878962536025, + "grad_norm": 0.13264571130275726, + "learning_rate": 3.1551760093105675e-07, + "loss": 0.0003, + "step": 3008 + }, + { + "epoch": 2.669474617601419, + "grad_norm": 0.0013396762078627944, + "learning_rate": 3.1384598684252643e-07, + "loss": 0.0, + "step": 3009 + }, + { + "epoch": 2.670361338949235, + "grad_norm": 0.00618441728875041, + "learning_rate": 3.121786691451062e-07, + "loss": 0.0, + "step": 3010 + }, + { + "epoch": 2.6712480602970516, + "grad_norm": 0.002804888878017664, + "learning_rate": 3.105156493674377e-07, + "loss": 0.0, + "step": 3011 + }, + { + "epoch": 2.672134781644868, + "grad_norm": 0.010229287669062614, + "learning_rate": 3.0885692903422194e-07, + "loss": 0.0, + "step": 3012 + }, + { + "epoch": 2.6730215029926847, + "grad_norm": 0.024806374683976173, + "learning_rate": 3.0720250966621624e-07, + "loss": 0.0001, + "step": 3013 + }, + { + "epoch": 2.673908224340501, + "grad_norm": 0.01945449411869049, + "learning_rate": 3.0555239278023786e-07, + "loss": 0.0001, + "step": 3014 + }, + { + "epoch": 2.6747949456883173, + "grad_norm": 0.06463182717561722, + "learning_rate": 3.039065798891583e-07, + "loss": 0.0002, + "step": 3015 + }, + { + "epoch": 2.675681667036134, + "grad_norm": 0.0008560666465200484, + "learning_rate": 3.0226507250190274e-07, + "loss": 0.0, + "step": 3016 + }, + { + "epoch": 2.6765683883839504, + "grad_norm": 0.006548132281750441, + "learning_rate": 3.0062787212344793e-07, + "loss": 0.0, + "step": 3017 + }, + { + "epoch": 2.677455109731767, + "grad_norm": 0.013721938244998455, + "learning_rate": 2.989949802548231e-07, + "loss": 0.0, + "step": 3018 + }, + { + "epoch": 2.6783418310795835, + "grad_norm": 0.0003532420960254967, + "learning_rate": 2.9736639839310744e-07, + "loss": 0.0, + "step": 3019 + }, + { + "epoch": 2.6792285524273995, + "grad_norm": 0.001644345000386238, + "learning_rate": 2.9574212803142986e-07, + "loss": 0.0, + "step": 3020 + }, + { + "epoch": 2.680115273775216, + "grad_norm": 0.011774779297411442, + "learning_rate": 2.9412217065896085e-07, + "loss": 0.0, + "step": 3021 + }, + { + "epoch": 2.6810019951230326, + "grad_norm": 0.008171603083610535, + "learning_rate": 2.9250652776092513e-07, + "loss": 0.0, + "step": 3022 + }, + { + "epoch": 2.681888716470849, + "grad_norm": 0.01484227180480957, + "learning_rate": 2.9089520081858444e-07, + "loss": 0.0001, + "step": 3023 + }, + { + "epoch": 2.6827754378186652, + "grad_norm": 0.08469567447900772, + "learning_rate": 2.8928819130924656e-07, + "loss": 0.0003, + "step": 3024 + }, + { + "epoch": 2.6836621591664818, + "grad_norm": 0.017949476838111877, + "learning_rate": 2.8768550070626125e-07, + "loss": 0.0001, + "step": 3025 + }, + { + "epoch": 2.6845488805142983, + "grad_norm": 0.009285989217460155, + "learning_rate": 2.8608713047901805e-07, + "loss": 0.0, + "step": 3026 + }, + { + "epoch": 2.685435601862115, + "grad_norm": 0.0016875972505658865, + "learning_rate": 2.8449308209294645e-07, + "loss": 0.0, + "step": 3027 + }, + { + "epoch": 2.6863223232099314, + "grad_norm": 0.3143994212150574, + "learning_rate": 2.829033570095102e-07, + "loss": 0.0011, + "step": 3028 + }, + { + "epoch": 2.687209044557748, + "grad_norm": 0.001808093162253499, + "learning_rate": 2.813179566862134e-07, + "loss": 0.0, + "step": 3029 + }, + { + "epoch": 2.6880957659055644, + "grad_norm": 0.03166713938117027, + "learning_rate": 2.797368825765928e-07, + "loss": 0.0001, + "step": 3030 + }, + { + "epoch": 2.6889824872533805, + "grad_norm": 0.13500897586345673, + "learning_rate": 2.7816013613021995e-07, + "loss": 0.0003, + "step": 3031 + }, + { + "epoch": 2.689869208601197, + "grad_norm": 0.05272597447037697, + "learning_rate": 2.765877187926963e-07, + "loss": 0.0001, + "step": 3032 + }, + { + "epoch": 2.6907559299490136, + "grad_norm": 0.0012098954757675529, + "learning_rate": 2.7501963200565916e-07, + "loss": 0.0, + "step": 3033 + }, + { + "epoch": 2.69164265129683, + "grad_norm": 0.005726026836782694, + "learning_rate": 2.7345587720677026e-07, + "loss": 0.0, + "step": 3034 + }, + { + "epoch": 2.692529372644646, + "grad_norm": 0.011130993254482746, + "learning_rate": 2.7189645582972155e-07, + "loss": 0.0, + "step": 3035 + }, + { + "epoch": 2.6934160939924627, + "grad_norm": 0.1115150898694992, + "learning_rate": 2.703413693042328e-07, + "loss": 0.0002, + "step": 3036 + }, + { + "epoch": 2.6943028153402793, + "grad_norm": 0.1770944446325302, + "learning_rate": 2.6879061905604963e-07, + "loss": 0.001, + "step": 3037 + }, + { + "epoch": 2.695189536688096, + "grad_norm": 0.0014840234071016312, + "learning_rate": 2.672442065069392e-07, + "loss": 0.0, + "step": 3038 + }, + { + "epoch": 2.6960762580359123, + "grad_norm": 0.002955073956400156, + "learning_rate": 2.657021330746956e-07, + "loss": 0.0, + "step": 3039 + }, + { + "epoch": 2.696962979383729, + "grad_norm": 0.0011210017837584019, + "learning_rate": 2.641644001731325e-07, + "loss": 0.0, + "step": 3040 + }, + { + "epoch": 2.697849700731545, + "grad_norm": 0.0069196210242807865, + "learning_rate": 2.6263100921208484e-07, + "loss": 0.0, + "step": 3041 + }, + { + "epoch": 2.6987364220793615, + "grad_norm": 0.0022021131590008736, + "learning_rate": 2.611019615974042e-07, + "loss": 0.0, + "step": 3042 + }, + { + "epoch": 2.699623143427178, + "grad_norm": 0.03699037432670593, + "learning_rate": 2.5957725873096583e-07, + "loss": 0.0001, + "step": 3043 + }, + { + "epoch": 2.7005098647749946, + "grad_norm": 0.031189635396003723, + "learning_rate": 2.5805690201065494e-07, + "loss": 0.0, + "step": 3044 + }, + { + "epoch": 2.7013965861228106, + "grad_norm": 0.35749873518943787, + "learning_rate": 2.565408928303775e-07, + "loss": 0.0026, + "step": 3045 + }, + { + "epoch": 2.702283307470627, + "grad_norm": 0.013324571773409843, + "learning_rate": 2.5502923258004853e-07, + "loss": 0.0001, + "step": 3046 + }, + { + "epoch": 2.7031700288184437, + "grad_norm": 0.012070167809724808, + "learning_rate": 2.535219226456015e-07, + "loss": 0.0, + "step": 3047 + }, + { + "epoch": 2.7040567501662602, + "grad_norm": 0.004928046837449074, + "learning_rate": 2.520189644089766e-07, + "loss": 0.0, + "step": 3048 + }, + { + "epoch": 2.7049434715140768, + "grad_norm": 0.004704819060862064, + "learning_rate": 2.505203592481259e-07, + "loss": 0.0, + "step": 3049 + }, + { + "epoch": 2.7058301928618933, + "grad_norm": 0.045247048139572144, + "learning_rate": 2.49026108537011e-07, + "loss": 0.0002, + "step": 3050 + }, + { + "epoch": 2.70671691420971, + "grad_norm": 0.022802405059337616, + "learning_rate": 2.4753621364560166e-07, + "loss": 0.0001, + "step": 3051 + }, + { + "epoch": 2.707603635557526, + "grad_norm": 0.02606274001300335, + "learning_rate": 2.4605067593987286e-07, + "loss": 0.0001, + "step": 3052 + }, + { + "epoch": 2.7084903569053425, + "grad_norm": 0.013050327077507973, + "learning_rate": 2.445694967818041e-07, + "loss": 0.0, + "step": 3053 + }, + { + "epoch": 2.709377078253159, + "grad_norm": 0.000353116454789415, + "learning_rate": 2.4309267752938126e-07, + "loss": 0.0, + "step": 3054 + }, + { + "epoch": 2.7102637996009755, + "grad_norm": 0.0003638343187049031, + "learning_rate": 2.416202195365913e-07, + "loss": 0.0, + "step": 3055 + }, + { + "epoch": 2.7111505209487916, + "grad_norm": 0.0013948410050943494, + "learning_rate": 2.401521241534227e-07, + "loss": 0.0, + "step": 3056 + }, + { + "epoch": 2.712037242296608, + "grad_norm": 0.0002752837899606675, + "learning_rate": 2.3868839272586485e-07, + "loss": 0.0, + "step": 3057 + }, + { + "epoch": 2.7129239636444247, + "grad_norm": 0.01548377051949501, + "learning_rate": 2.3722902659590653e-07, + "loss": 0.0001, + "step": 3058 + }, + { + "epoch": 2.713810684992241, + "grad_norm": 0.1762523651123047, + "learning_rate": 2.357740271015324e-07, + "loss": 0.0004, + "step": 3059 + }, + { + "epoch": 2.7146974063400577, + "grad_norm": 0.0014459047233685851, + "learning_rate": 2.343233955767249e-07, + "loss": 0.0, + "step": 3060 + }, + { + "epoch": 2.7155841276878743, + "grad_norm": 0.025118915364146233, + "learning_rate": 2.3287713335146177e-07, + "loss": 0.0001, + "step": 3061 + }, + { + "epoch": 2.716470849035691, + "grad_norm": 0.4808674156665802, + "learning_rate": 2.3143524175171628e-07, + "loss": 0.0008, + "step": 3062 + }, + { + "epoch": 2.717357570383507, + "grad_norm": 0.005553117021918297, + "learning_rate": 2.2999772209945037e-07, + "loss": 0.0, + "step": 3063 + }, + { + "epoch": 2.7182442917313234, + "grad_norm": 0.010888897813856602, + "learning_rate": 2.2856457571262314e-07, + "loss": 0.0, + "step": 3064 + }, + { + "epoch": 2.71913101307914, + "grad_norm": 0.14666052162647247, + "learning_rate": 2.271358039051802e-07, + "loss": 0.0004, + "step": 3065 + }, + { + "epoch": 2.7200177344269565, + "grad_norm": 0.13892030715942383, + "learning_rate": 2.2571140798705816e-07, + "loss": 0.0003, + "step": 3066 + }, + { + "epoch": 2.7209044557747726, + "grad_norm": 0.0004430158587638289, + "learning_rate": 2.2429138926417959e-07, + "loss": 0.0, + "step": 3067 + }, + { + "epoch": 2.721791177122589, + "grad_norm": 0.024874933063983917, + "learning_rate": 2.2287574903845866e-07, + "loss": 0.0, + "step": 3068 + }, + { + "epoch": 2.7226778984704056, + "grad_norm": 0.007251809351146221, + "learning_rate": 2.2146448860778934e-07, + "loss": 0.0, + "step": 3069 + }, + { + "epoch": 2.723564619818222, + "grad_norm": 0.011288401670753956, + "learning_rate": 2.200576092660539e-07, + "loss": 0.0, + "step": 3070 + }, + { + "epoch": 2.7244513411660387, + "grad_norm": 0.02641960419714451, + "learning_rate": 2.186551123031172e-07, + "loss": 0.0001, + "step": 3071 + }, + { + "epoch": 2.7253380625138552, + "grad_norm": 0.021034937351942062, + "learning_rate": 2.1725699900482567e-07, + "loss": 0.0001, + "step": 3072 + }, + { + "epoch": 2.7262247838616713, + "grad_norm": 0.022156720981001854, + "learning_rate": 2.1586327065300672e-07, + "loss": 0.0, + "step": 3073 + }, + { + "epoch": 2.727111505209488, + "grad_norm": 0.028590481728315353, + "learning_rate": 2.1447392852546766e-07, + "loss": 0.0001, + "step": 3074 + }, + { + "epoch": 2.7279982265573044, + "grad_norm": 0.005033856723457575, + "learning_rate": 2.130889738959946e-07, + "loss": 0.0, + "step": 3075 + }, + { + "epoch": 2.728884947905121, + "grad_norm": 0.2219175547361374, + "learning_rate": 2.1170840803435066e-07, + "loss": 0.0006, + "step": 3076 + }, + { + "epoch": 2.729771669252937, + "grad_norm": 0.00024738299543969333, + "learning_rate": 2.1033223220627563e-07, + "loss": 0.0, + "step": 3077 + }, + { + "epoch": 2.7306583906007535, + "grad_norm": 0.06527652591466904, + "learning_rate": 2.0896044767348468e-07, + "loss": 0.0003, + "step": 3078 + }, + { + "epoch": 2.73154511194857, + "grad_norm": 0.0066484929993748665, + "learning_rate": 2.075930556936667e-07, + "loss": 0.0, + "step": 3079 + }, + { + "epoch": 2.7324318332963866, + "grad_norm": 0.012028141878545284, + "learning_rate": 2.062300575204823e-07, + "loss": 0.0, + "step": 3080 + }, + { + "epoch": 2.733318554644203, + "grad_norm": 0.013649419881403446, + "learning_rate": 2.048714544035646e-07, + "loss": 0.0, + "step": 3081 + }, + { + "epoch": 2.7342052759920197, + "grad_norm": 0.01976783014833927, + "learning_rate": 2.0351724758851776e-07, + "loss": 0.0001, + "step": 3082 + }, + { + "epoch": 2.735091997339836, + "grad_norm": 0.07697169482707977, + "learning_rate": 2.0216743831691477e-07, + "loss": 0.0002, + "step": 3083 + }, + { + "epoch": 2.7359787186876523, + "grad_norm": 0.006745744496583939, + "learning_rate": 2.0082202782629566e-07, + "loss": 0.0, + "step": 3084 + }, + { + "epoch": 2.736865440035469, + "grad_norm": 0.0004296990518923849, + "learning_rate": 1.9948101735017045e-07, + "loss": 0.0, + "step": 3085 + }, + { + "epoch": 2.7377521613832854, + "grad_norm": 0.09031447768211365, + "learning_rate": 1.9814440811801228e-07, + "loss": 0.0004, + "step": 3086 + }, + { + "epoch": 2.738638882731102, + "grad_norm": 0.0685887336730957, + "learning_rate": 1.968122013552609e-07, + "loss": 0.0003, + "step": 3087 + }, + { + "epoch": 2.739525604078918, + "grad_norm": 0.0008473261841572821, + "learning_rate": 1.954843982833171e-07, + "loss": 0.0, + "step": 3088 + }, + { + "epoch": 2.7404123254267345, + "grad_norm": 0.37110695242881775, + "learning_rate": 1.9416100011954874e-07, + "loss": 0.0018, + "step": 3089 + }, + { + "epoch": 2.741299046774551, + "grad_norm": 0.0020779715850949287, + "learning_rate": 1.9284200807728083e-07, + "loss": 0.0, + "step": 3090 + }, + { + "epoch": 2.7421857681223676, + "grad_norm": 0.015576466917991638, + "learning_rate": 1.9152742336580155e-07, + "loss": 0.0001, + "step": 3091 + }, + { + "epoch": 2.743072489470184, + "grad_norm": 0.016060447320342064, + "learning_rate": 1.902172471903563e-07, + "loss": 0.0, + "step": 3092 + }, + { + "epoch": 2.7439592108180006, + "grad_norm": 0.002036657417193055, + "learning_rate": 1.8891148075215137e-07, + "loss": 0.0, + "step": 3093 + }, + { + "epoch": 2.7448459321658167, + "grad_norm": 0.0033351199235767126, + "learning_rate": 1.8761012524834588e-07, + "loss": 0.0, + "step": 3094 + }, + { + "epoch": 2.7457326535136333, + "grad_norm": 0.008396641351282597, + "learning_rate": 1.8631318187205817e-07, + "loss": 0.0, + "step": 3095 + }, + { + "epoch": 2.74661937486145, + "grad_norm": 0.02515893056988716, + "learning_rate": 1.850206518123615e-07, + "loss": 0.0, + "step": 3096 + }, + { + "epoch": 2.7475060962092663, + "grad_norm": 0.0037924533244222403, + "learning_rate": 1.8373253625428133e-07, + "loss": 0.0, + "step": 3097 + }, + { + "epoch": 2.7483928175570824, + "grad_norm": 0.0001926338445628062, + "learning_rate": 1.8244883637879518e-07, + "loss": 0.0, + "step": 3098 + }, + { + "epoch": 2.749279538904899, + "grad_norm": 0.014933107420802116, + "learning_rate": 1.8116955336283603e-07, + "loss": 0.0001, + "step": 3099 + }, + { + "epoch": 2.7501662602527155, + "grad_norm": 0.015284646302461624, + "learning_rate": 1.7989468837928237e-07, + "loss": 0.0001, + "step": 3100 + }, + { + "epoch": 2.751052981600532, + "grad_norm": 0.016768403351306915, + "learning_rate": 1.7862424259696532e-07, + "loss": 0.0001, + "step": 3101 + }, + { + "epoch": 2.7519397029483486, + "grad_norm": 0.001939128735102713, + "learning_rate": 1.773582171806637e-07, + "loss": 0.0, + "step": 3102 + }, + { + "epoch": 2.752826424296165, + "grad_norm": 0.0016595263732597232, + "learning_rate": 1.7609661329110295e-07, + "loss": 0.0, + "step": 3103 + }, + { + "epoch": 2.7537131456439816, + "grad_norm": 0.0015289318980649114, + "learning_rate": 1.7483943208495558e-07, + "loss": 0.0, + "step": 3104 + }, + { + "epoch": 2.7545998669917977, + "grad_norm": 0.09125112742185593, + "learning_rate": 1.735866747148385e-07, + "loss": 0.0004, + "step": 3105 + }, + { + "epoch": 2.7554865883396142, + "grad_norm": 0.03696690499782562, + "learning_rate": 1.7233834232931345e-07, + "loss": 0.0001, + "step": 3106 + }, + { + "epoch": 2.7563733096874308, + "grad_norm": 0.0077039701864123344, + "learning_rate": 1.7109443607288446e-07, + "loss": 0.0, + "step": 3107 + }, + { + "epoch": 2.7572600310352473, + "grad_norm": 0.0009759148233570158, + "learning_rate": 1.6985495708599863e-07, + "loss": 0.0, + "step": 3108 + }, + { + "epoch": 2.7581467523830634, + "grad_norm": 0.0027849047910422087, + "learning_rate": 1.6861990650504256e-07, + "loss": 0.0, + "step": 3109 + }, + { + "epoch": 2.75903347373088, + "grad_norm": 0.0005394841427914798, + "learning_rate": 1.6738928546234435e-07, + "loss": 0.0, + "step": 3110 + }, + { + "epoch": 2.7599201950786965, + "grad_norm": 0.027912667021155357, + "learning_rate": 1.6616309508617033e-07, + "loss": 0.0001, + "step": 3111 + }, + { + "epoch": 2.760806916426513, + "grad_norm": 0.034508995711803436, + "learning_rate": 1.6494133650072352e-07, + "loss": 0.0001, + "step": 3112 + }, + { + "epoch": 2.7616936377743295, + "grad_norm": 0.2373371124267578, + "learning_rate": 1.6372401082614564e-07, + "loss": 0.0012, + "step": 3113 + }, + { + "epoch": 2.762580359122146, + "grad_norm": 0.0013953030575066805, + "learning_rate": 1.6251111917851391e-07, + "loss": 0.0, + "step": 3114 + }, + { + "epoch": 2.7634670804699626, + "grad_norm": 0.0005270761903375387, + "learning_rate": 1.6130266266983886e-07, + "loss": 0.0, + "step": 3115 + }, + { + "epoch": 2.7643538018177787, + "grad_norm": 0.0027937826234847307, + "learning_rate": 1.6009864240806528e-07, + "loss": 0.0, + "step": 3116 + }, + { + "epoch": 2.765240523165595, + "grad_norm": 0.03274468332529068, + "learning_rate": 1.5889905949707186e-07, + "loss": 0.0001, + "step": 3117 + }, + { + "epoch": 2.7661272445134117, + "grad_norm": 0.0675559788942337, + "learning_rate": 1.5770391503666883e-07, + "loss": 0.0001, + "step": 3118 + }, + { + "epoch": 2.7670139658612283, + "grad_norm": 0.005557031370699406, + "learning_rate": 1.5651321012259525e-07, + "loss": 0.0, + "step": 3119 + }, + { + "epoch": 2.7679006872090444, + "grad_norm": 0.08115627616643906, + "learning_rate": 1.5532694584652175e-07, + "loss": 0.0002, + "step": 3120 + }, + { + "epoch": 2.768787408556861, + "grad_norm": 0.0014597626868635416, + "learning_rate": 1.541451232960467e-07, + "loss": 0.0, + "step": 3121 + }, + { + "epoch": 2.7696741299046774, + "grad_norm": 0.13055862486362457, + "learning_rate": 1.5296774355469778e-07, + "loss": 0.0005, + "step": 3122 + }, + { + "epoch": 2.770560851252494, + "grad_norm": 0.009585856460034847, + "learning_rate": 1.5179480770192602e-07, + "loss": 0.0, + "step": 3123 + }, + { + "epoch": 2.7714475726003105, + "grad_norm": 0.00751367025077343, + "learning_rate": 1.506263168131128e-07, + "loss": 0.0, + "step": 3124 + }, + { + "epoch": 2.772334293948127, + "grad_norm": 0.03164404258131981, + "learning_rate": 1.4946227195956008e-07, + "loss": 0.0001, + "step": 3125 + }, + { + "epoch": 2.773221015295943, + "grad_norm": 0.25857213139533997, + "learning_rate": 1.4830267420849587e-07, + "loss": 0.0027, + "step": 3126 + }, + { + "epoch": 2.7741077366437596, + "grad_norm": 0.000911539129447192, + "learning_rate": 1.4714752462307025e-07, + "loss": 0.0, + "step": 3127 + }, + { + "epoch": 2.774994457991576, + "grad_norm": 0.006078448612242937, + "learning_rate": 1.4599682426235552e-07, + "loss": 0.0, + "step": 3128 + }, + { + "epoch": 2.7758811793393927, + "grad_norm": 0.0028333021327853203, + "learning_rate": 1.4485057418134385e-07, + "loss": 0.0, + "step": 3129 + }, + { + "epoch": 2.776767900687209, + "grad_norm": 0.007403951603919268, + "learning_rate": 1.437087754309485e-07, + "loss": 0.0, + "step": 3130 + }, + { + "epoch": 2.7776546220350253, + "grad_norm": 0.014319689944386482, + "learning_rate": 1.4257142905800048e-07, + "loss": 0.0, + "step": 3131 + }, + { + "epoch": 2.778541343382842, + "grad_norm": 0.0011654490372166038, + "learning_rate": 1.414385361052495e-07, + "loss": 0.0, + "step": 3132 + }, + { + "epoch": 2.7794280647306584, + "grad_norm": 0.0013293580850586295, + "learning_rate": 1.403100976113625e-07, + "loss": 0.0, + "step": 3133 + }, + { + "epoch": 2.780314786078475, + "grad_norm": 0.002857569605112076, + "learning_rate": 1.3918611461092135e-07, + "loss": 0.0, + "step": 3134 + }, + { + "epoch": 2.7812015074262915, + "grad_norm": 0.5601361989974976, + "learning_rate": 1.3806658813442453e-07, + "loss": 0.0047, + "step": 3135 + }, + { + "epoch": 2.782088228774108, + "grad_norm": 0.3579235076904297, + "learning_rate": 1.3695151920828265e-07, + "loss": 0.001, + "step": 3136 + }, + { + "epoch": 2.782974950121924, + "grad_norm": 0.04795952886343002, + "learning_rate": 1.3584090885482183e-07, + "loss": 0.0, + "step": 3137 + }, + { + "epoch": 2.7838616714697406, + "grad_norm": 0.027433745563030243, + "learning_rate": 1.347347580922781e-07, + "loss": 0.0001, + "step": 3138 + }, + { + "epoch": 2.784748392817557, + "grad_norm": 0.1161801666021347, + "learning_rate": 1.3363306793480191e-07, + "loss": 0.0006, + "step": 3139 + }, + { + "epoch": 2.7856351141653737, + "grad_norm": 0.003158286912366748, + "learning_rate": 1.3253583939245028e-07, + "loss": 0.0, + "step": 3140 + }, + { + "epoch": 2.7865218355131898, + "grad_norm": 0.2493053823709488, + "learning_rate": 1.314430734711919e-07, + "loss": 0.0043, + "step": 3141 + }, + { + "epoch": 2.7874085568610063, + "grad_norm": 0.02230064757168293, + "learning_rate": 1.3035477117290472e-07, + "loss": 0.0001, + "step": 3142 + }, + { + "epoch": 2.788295278208823, + "grad_norm": 0.026541607454419136, + "learning_rate": 1.292709334953729e-07, + "loss": 0.0001, + "step": 3143 + }, + { + "epoch": 2.7891819995566394, + "grad_norm": 0.013466564007103443, + "learning_rate": 1.2819156143228706e-07, + "loss": 0.0, + "step": 3144 + }, + { + "epoch": 2.790068720904456, + "grad_norm": 0.0019223958952352405, + "learning_rate": 1.2711665597324563e-07, + "loss": 0.0, + "step": 3145 + }, + { + "epoch": 2.7909554422522724, + "grad_norm": 0.04089783504605293, + "learning_rate": 1.260462181037492e-07, + "loss": 0.0001, + "step": 3146 + }, + { + "epoch": 2.7918421636000885, + "grad_norm": 0.009505347348749638, + "learning_rate": 1.2498024880520543e-07, + "loss": 0.0, + "step": 3147 + }, + { + "epoch": 2.792728884947905, + "grad_norm": 0.001099666696973145, + "learning_rate": 1.239187490549215e-07, + "loss": 0.0, + "step": 3148 + }, + { + "epoch": 2.7936156062957216, + "grad_norm": 0.0020217658020555973, + "learning_rate": 1.2286171982611062e-07, + "loss": 0.0, + "step": 3149 + }, + { + "epoch": 2.794502327643538, + "grad_norm": 0.008771456778049469, + "learning_rate": 1.2180916208788418e-07, + "loss": 0.0, + "step": 3150 + }, + { + "epoch": 2.795389048991354, + "grad_norm": 0.005936725065112114, + "learning_rate": 1.2076107680525529e-07, + "loss": 0.0, + "step": 3151 + }, + { + "epoch": 2.7962757703391707, + "grad_norm": 0.09092960506677628, + "learning_rate": 1.1971746493913693e-07, + "loss": 0.0003, + "step": 3152 + }, + { + "epoch": 2.7971624916869873, + "grad_norm": 0.2639486491680145, + "learning_rate": 1.1867832744633989e-07, + "loss": 0.0014, + "step": 3153 + }, + { + "epoch": 2.798049213034804, + "grad_norm": 0.0012152879498898983, + "learning_rate": 1.1764366527957427e-07, + "loss": 0.0, + "step": 3154 + }, + { + "epoch": 2.7989359343826203, + "grad_norm": 0.0016898787580430508, + "learning_rate": 1.166134793874446e-07, + "loss": 0.0, + "step": 3155 + }, + { + "epoch": 2.799822655730437, + "grad_norm": 0.000665109371766448, + "learning_rate": 1.1558777071445371e-07, + "loss": 0.0, + "step": 3156 + }, + { + "epoch": 2.8007093770782534, + "grad_norm": 0.16242636740207672, + "learning_rate": 1.1456654020099767e-07, + "loss": 0.0006, + "step": 3157 + }, + { + "epoch": 2.8015960984260695, + "grad_norm": 0.001742783235386014, + "learning_rate": 1.135497887833692e-07, + "loss": 0.0, + "step": 3158 + }, + { + "epoch": 2.802482819773886, + "grad_norm": 0.013286867178976536, + "learning_rate": 1.1253751739375207e-07, + "loss": 0.0, + "step": 3159 + }, + { + "epoch": 2.8033695411217026, + "grad_norm": 0.005383123643696308, + "learning_rate": 1.1152972696022447e-07, + "loss": 0.0, + "step": 3160 + }, + { + "epoch": 2.804256262469519, + "grad_norm": 0.0018168076639994979, + "learning_rate": 1.105264184067556e-07, + "loss": 0.0, + "step": 3161 + }, + { + "epoch": 2.805142983817335, + "grad_norm": 0.001898395363241434, + "learning_rate": 1.0952759265320412e-07, + "loss": 0.0, + "step": 3162 + }, + { + "epoch": 2.8060297051651517, + "grad_norm": 0.03626805543899536, + "learning_rate": 1.0853325061532194e-07, + "loss": 0.0002, + "step": 3163 + }, + { + "epoch": 2.8069164265129682, + "grad_norm": 0.18261322379112244, + "learning_rate": 1.0754339320474816e-07, + "loss": 0.0011, + "step": 3164 + }, + { + "epoch": 2.8078031478607848, + "grad_norm": 0.0031685426365584135, + "learning_rate": 1.065580213290085e-07, + "loss": 0.0, + "step": 3165 + }, + { + "epoch": 2.8086898692086013, + "grad_norm": 0.003848884254693985, + "learning_rate": 1.0557713589152086e-07, + "loss": 0.0, + "step": 3166 + }, + { + "epoch": 2.809576590556418, + "grad_norm": 0.010794474743306637, + "learning_rate": 1.0460073779158586e-07, + "loss": 0.0, + "step": 3167 + }, + { + "epoch": 2.810463311904234, + "grad_norm": 0.003852618858218193, + "learning_rate": 1.0362882792439189e-07, + "loss": 0.0, + "step": 3168 + }, + { + "epoch": 2.8113500332520505, + "grad_norm": 0.006770124193280935, + "learning_rate": 1.0266140718101003e-07, + "loss": 0.0, + "step": 3169 + }, + { + "epoch": 2.812236754599867, + "grad_norm": 0.012775304727256298, + "learning_rate": 1.0169847644840026e-07, + "loss": 0.0001, + "step": 3170 + }, + { + "epoch": 2.8131234759476835, + "grad_norm": 0.06564140319824219, + "learning_rate": 1.0074003660940135e-07, + "loss": 0.0002, + "step": 3171 + }, + { + "epoch": 2.8140101972954996, + "grad_norm": 0.0029760506004095078, + "learning_rate": 9.978608854273708e-08, + "loss": 0.0, + "step": 3172 + }, + { + "epoch": 2.814896918643316, + "grad_norm": 0.002229688921943307, + "learning_rate": 9.883663312301228e-08, + "loss": 0.0, + "step": 3173 + }, + { + "epoch": 2.8157836399911327, + "grad_norm": 0.006304146256297827, + "learning_rate": 9.789167122071341e-08, + "loss": 0.0, + "step": 3174 + }, + { + "epoch": 2.816670361338949, + "grad_norm": 0.0052603622898459435, + "learning_rate": 9.695120370220634e-08, + "loss": 0.0, + "step": 3175 + }, + { + "epoch": 2.8175570826867657, + "grad_norm": 0.0849158987402916, + "learning_rate": 9.60152314297369e-08, + "loss": 0.0003, + "step": 3176 + }, + { + "epoch": 2.8184438040345823, + "grad_norm": 0.088808573782444, + "learning_rate": 9.508375526142976e-08, + "loss": 0.0005, + "step": 3177 + }, + { + "epoch": 2.819330525382399, + "grad_norm": 0.0012514623813331127, + "learning_rate": 9.415677605128681e-08, + "loss": 0.0, + "step": 3178 + }, + { + "epoch": 2.820217246730215, + "grad_norm": 0.005674469750374556, + "learning_rate": 9.323429464918766e-08, + "loss": 0.0, + "step": 3179 + }, + { + "epoch": 2.8211039680780314, + "grad_norm": 0.00882049836218357, + "learning_rate": 9.231631190088742e-08, + "loss": 0.0, + "step": 3180 + }, + { + "epoch": 2.821990689425848, + "grad_norm": 0.0023833143059164286, + "learning_rate": 9.140282864801786e-08, + "loss": 0.0, + "step": 3181 + }, + { + "epoch": 2.8228774107736645, + "grad_norm": 0.07122648507356644, + "learning_rate": 9.049384572808407e-08, + "loss": 0.0003, + "step": 3182 + }, + { + "epoch": 2.8237641321214806, + "grad_norm": 0.010699799284338951, + "learning_rate": 8.958936397446605e-08, + "loss": 0.0, + "step": 3183 + }, + { + "epoch": 2.824650853469297, + "grad_norm": 0.005035745445638895, + "learning_rate": 8.868938421641715e-08, + "loss": 0.0, + "step": 3184 + }, + { + "epoch": 2.8255375748171137, + "grad_norm": 0.0031189401634037495, + "learning_rate": 8.77939072790629e-08, + "loss": 0.0, + "step": 3185 + }, + { + "epoch": 2.82642429616493, + "grad_norm": 0.007225423585623503, + "learning_rate": 8.690293398339933e-08, + "loss": 0.0, + "step": 3186 + }, + { + "epoch": 2.8273110175127467, + "grad_norm": 0.019024573266506195, + "learning_rate": 8.601646514629635e-08, + "loss": 0.0001, + "step": 3187 + }, + { + "epoch": 2.8281977388605632, + "grad_norm": 0.0028227169532328844, + "learning_rate": 8.513450158049109e-08, + "loss": 0.0, + "step": 3188 + }, + { + "epoch": 2.82908446020838, + "grad_norm": 0.04316164553165436, + "learning_rate": 8.425704409459168e-08, + "loss": 0.0001, + "step": 3189 + }, + { + "epoch": 2.829971181556196, + "grad_norm": 0.21143779158592224, + "learning_rate": 8.338409349307409e-08, + "loss": 0.0012, + "step": 3190 + }, + { + "epoch": 2.8308579029040124, + "grad_norm": 0.007404598407447338, + "learning_rate": 8.251565057628364e-08, + "loss": 0.0, + "step": 3191 + }, + { + "epoch": 2.831744624251829, + "grad_norm": 0.03625713661313057, + "learning_rate": 8.165171614043066e-08, + "loss": 0.0001, + "step": 3192 + }, + { + "epoch": 2.8326313455996455, + "grad_norm": 0.0007035653688944876, + "learning_rate": 8.079229097759433e-08, + "loss": 0.0, + "step": 3193 + }, + { + "epoch": 2.8335180669474616, + "grad_norm": 0.04336908459663391, + "learning_rate": 7.993737587571825e-08, + "loss": 0.0001, + "step": 3194 + }, + { + "epoch": 2.834404788295278, + "grad_norm": 0.0009928614599630237, + "learning_rate": 7.908697161861212e-08, + "loss": 0.0, + "step": 3195 + }, + { + "epoch": 2.8352915096430946, + "grad_norm": 0.05311229079961777, + "learning_rate": 7.824107898594835e-08, + "loss": 0.0001, + "step": 3196 + }, + { + "epoch": 2.836178230990911, + "grad_norm": 0.000616305333096534, + "learning_rate": 7.739969875326381e-08, + "loss": 0.0, + "step": 3197 + }, + { + "epoch": 2.8370649523387277, + "grad_norm": 0.0035000985953956842, + "learning_rate": 7.656283169195866e-08, + "loss": 0.0, + "step": 3198 + }, + { + "epoch": 2.837951673686544, + "grad_norm": 0.10979343205690384, + "learning_rate": 7.573047856929582e-08, + "loss": 0.0002, + "step": 3199 + }, + { + "epoch": 2.8388383950343603, + "grad_norm": 0.036622680723667145, + "learning_rate": 7.490264014839654e-08, + "loss": 0.0001, + "step": 3200 + }, + { + "epoch": 2.839725116382177, + "grad_norm": 0.03518826141953468, + "learning_rate": 7.407931718824812e-08, + "loss": 0.0001, + "step": 3201 + }, + { + "epoch": 2.8406118377299934, + "grad_norm": 0.45864933729171753, + "learning_rate": 7.326051044369343e-08, + "loss": 0.0083, + "step": 3202 + }, + { + "epoch": 2.84149855907781, + "grad_norm": 0.0005170101649127901, + "learning_rate": 7.244622066543582e-08, + "loss": 0.0, + "step": 3203 + }, + { + "epoch": 2.842385280425626, + "grad_norm": 0.0011105621233582497, + "learning_rate": 7.163644860003927e-08, + "loss": 0.0, + "step": 3204 + }, + { + "epoch": 2.8432720017734425, + "grad_norm": 0.10616616904735565, + "learning_rate": 7.083119498992319e-08, + "loss": 0.0007, + "step": 3205 + }, + { + "epoch": 2.844158723121259, + "grad_norm": 0.0022827358916401863, + "learning_rate": 7.003046057336704e-08, + "loss": 0.0, + "step": 3206 + }, + { + "epoch": 2.8450454444690756, + "grad_norm": 0.2717021703720093, + "learning_rate": 6.923424608450358e-08, + "loss": 0.0003, + "step": 3207 + }, + { + "epoch": 2.845932165816892, + "grad_norm": 0.4444045424461365, + "learning_rate": 6.844255225332552e-08, + "loss": 0.0028, + "step": 3208 + }, + { + "epoch": 2.8468188871647087, + "grad_norm": 0.09726931899785995, + "learning_rate": 6.765537980567727e-08, + "loss": 0.0004, + "step": 3209 + }, + { + "epoch": 2.847705608512525, + "grad_norm": 0.006602408364415169, + "learning_rate": 6.687272946326096e-08, + "loss": 0.0, + "step": 3210 + }, + { + "epoch": 2.8485923298603413, + "grad_norm": 0.0016501571517437696, + "learning_rate": 6.609460194362927e-08, + "loss": 0.0, + "step": 3211 + }, + { + "epoch": 2.849479051208158, + "grad_norm": 0.012879370711743832, + "learning_rate": 6.53209979601932e-08, + "loss": 0.0, + "step": 3212 + }, + { + "epoch": 2.8503657725559743, + "grad_norm": 0.0010488486150279641, + "learning_rate": 6.455191822221097e-08, + "loss": 0.0, + "step": 3213 + }, + { + "epoch": 2.851252493903791, + "grad_norm": 0.0028409042861312628, + "learning_rate": 6.37873634347963e-08, + "loss": 0.0, + "step": 3214 + }, + { + "epoch": 2.852139215251607, + "grad_norm": 0.012911890633404255, + "learning_rate": 6.302733429891406e-08, + "loss": 0.0, + "step": 3215 + }, + { + "epoch": 2.8530259365994235, + "grad_norm": 0.11188660562038422, + "learning_rate": 6.22718315113785e-08, + "loss": 0.0003, + "step": 3216 + }, + { + "epoch": 2.85391265794724, + "grad_norm": 0.1650157868862152, + "learning_rate": 6.1520855764855e-08, + "loss": 0.0008, + "step": 3217 + }, + { + "epoch": 2.8547993792950566, + "grad_norm": 0.018146656453609467, + "learning_rate": 6.077440774785837e-08, + "loss": 0.0, + "step": 3218 + }, + { + "epoch": 2.855686100642873, + "grad_norm": 0.003937463741749525, + "learning_rate": 6.003248814475116e-08, + "loss": 0.0, + "step": 3219 + }, + { + "epoch": 2.8565728219906896, + "grad_norm": 0.012310229241847992, + "learning_rate": 5.929509763574648e-08, + "loss": 0.0, + "step": 3220 + }, + { + "epoch": 2.8574595433385057, + "grad_norm": 0.00288616050966084, + "learning_rate": 5.8562236896902437e-08, + "loss": 0.0, + "step": 3221 + }, + { + "epoch": 2.8583462646863222, + "grad_norm": 0.0060603683814406395, + "learning_rate": 5.7833906600125996e-08, + "loss": 0.0, + "step": 3222 + }, + { + "epoch": 2.859232986034139, + "grad_norm": 0.03721298649907112, + "learning_rate": 5.711010741316969e-08, + "loss": 0.0001, + "step": 3223 + }, + { + "epoch": 2.8601197073819553, + "grad_norm": 0.0499625988304615, + "learning_rate": 5.6390839999631574e-08, + "loss": 0.0001, + "step": 3224 + }, + { + "epoch": 2.8610064287297714, + "grad_norm": 0.04375501722097397, + "learning_rate": 5.567610501895526e-08, + "loss": 0.0001, + "step": 3225 + }, + { + "epoch": 2.861893150077588, + "grad_norm": 0.002494650427252054, + "learning_rate": 5.496590312642991e-08, + "loss": 0.0, + "step": 3226 + }, + { + "epoch": 2.8627798714254045, + "grad_norm": 0.011626002378761768, + "learning_rate": 5.4260234973186335e-08, + "loss": 0.0, + "step": 3227 + }, + { + "epoch": 2.863666592773221, + "grad_norm": 0.042777977883815765, + "learning_rate": 5.3559101206200337e-08, + "loss": 0.0001, + "step": 3228 + }, + { + "epoch": 2.8645533141210375, + "grad_norm": 0.0015777841908857226, + "learning_rate": 5.286250246828994e-08, + "loss": 0.0, + "step": 3229 + }, + { + "epoch": 2.865440035468854, + "grad_norm": 0.041069697588682175, + "learning_rate": 5.217043939811595e-08, + "loss": 0.0001, + "step": 3230 + }, + { + "epoch": 2.8663267568166706, + "grad_norm": 0.0033635362051427364, + "learning_rate": 5.1482912630180234e-08, + "loss": 0.0, + "step": 3231 + }, + { + "epoch": 2.8672134781644867, + "grad_norm": 0.4834699332714081, + "learning_rate": 5.079992279482471e-08, + "loss": 0.0004, + "step": 3232 + }, + { + "epoch": 2.868100199512303, + "grad_norm": 0.23373693227767944, + "learning_rate": 5.012147051823346e-08, + "loss": 0.0017, + "step": 3233 + }, + { + "epoch": 2.8689869208601197, + "grad_norm": 0.008513184264302254, + "learning_rate": 4.9447556422430046e-08, + "loss": 0.0, + "step": 3234 + }, + { + "epoch": 2.8698736422079363, + "grad_norm": 0.009647476486861706, + "learning_rate": 4.877818112527632e-08, + "loss": 0.0, + "step": 3235 + }, + { + "epoch": 2.8707603635557524, + "grad_norm": 0.017421780154109, + "learning_rate": 4.811334524047307e-08, + "loss": 0.0, + "step": 3236 + }, + { + "epoch": 2.871647084903569, + "grad_norm": 0.013581366278231144, + "learning_rate": 4.745304937756101e-08, + "loss": 0.0, + "step": 3237 + }, + { + "epoch": 2.8725338062513854, + "grad_norm": 0.0025726743042469025, + "learning_rate": 4.679729414191536e-08, + "loss": 0.0, + "step": 3238 + }, + { + "epoch": 2.873420527599202, + "grad_norm": 0.005139803979545832, + "learning_rate": 4.614608013475075e-08, + "loss": 0.0, + "step": 3239 + }, + { + "epoch": 2.8743072489470185, + "grad_norm": 0.004231195896863937, + "learning_rate": 4.5499407953117355e-08, + "loss": 0.0, + "step": 3240 + }, + { + "epoch": 2.875193970294835, + "grad_norm": 0.015432806685566902, + "learning_rate": 4.4857278189902046e-08, + "loss": 0.0, + "step": 3241 + }, + { + "epoch": 2.8760806916426516, + "grad_norm": 0.014841458760201931, + "learning_rate": 4.4219691433826094e-08, + "loss": 0.0, + "step": 3242 + }, + { + "epoch": 2.8769674129904677, + "grad_norm": 0.0013239766703918576, + "learning_rate": 4.358664826944636e-08, + "loss": 0.0, + "step": 3243 + }, + { + "epoch": 2.877854134338284, + "grad_norm": 0.009761502966284752, + "learning_rate": 4.295814927715303e-08, + "loss": 0.0, + "step": 3244 + }, + { + "epoch": 2.8787408556861007, + "grad_norm": 0.003177881008014083, + "learning_rate": 4.233419503317182e-08, + "loss": 0.0, + "step": 3245 + }, + { + "epoch": 2.8796275770339173, + "grad_norm": 0.35972318053245544, + "learning_rate": 4.171478610955904e-08, + "loss": 0.0009, + "step": 3246 + }, + { + "epoch": 2.8805142983817333, + "grad_norm": 0.0046821520663797855, + "learning_rate": 4.109992307420707e-08, + "loss": 0.0, + "step": 3247 + }, + { + "epoch": 2.88140101972955, + "grad_norm": 0.02193521521985531, + "learning_rate": 4.048960649083777e-08, + "loss": 0.0001, + "step": 3248 + }, + { + "epoch": 2.8822877410773664, + "grad_norm": 0.16279035806655884, + "learning_rate": 3.9883836919006325e-08, + "loss": 0.0009, + "step": 3249 + }, + { + "epoch": 2.883174462425183, + "grad_norm": 0.04672611132264137, + "learning_rate": 3.928261491409735e-08, + "loss": 0.0001, + "step": 3250 + }, + { + "epoch": 2.8840611837729995, + "grad_norm": 0.12944093346595764, + "learning_rate": 3.868594102732881e-08, + "loss": 0.0006, + "step": 3251 + }, + { + "epoch": 2.884947905120816, + "grad_norm": 0.4571356773376465, + "learning_rate": 3.8093815805745895e-08, + "loss": 0.0018, + "step": 3252 + }, + { + "epoch": 2.885834626468632, + "grad_norm": 0.21273784339427948, + "learning_rate": 3.7506239792225454e-08, + "loss": 0.0013, + "step": 3253 + }, + { + "epoch": 2.8867213478164486, + "grad_norm": 0.11761754006147385, + "learning_rate": 3.6923213525472126e-08, + "loss": 0.0004, + "step": 3254 + }, + { + "epoch": 2.887608069164265, + "grad_norm": 0.005210244562476873, + "learning_rate": 3.63447375400211e-08, + "loss": 0.0, + "step": 3255 + }, + { + "epoch": 2.8884947905120817, + "grad_norm": 0.025401975959539413, + "learning_rate": 3.577081236623425e-08, + "loss": 0.0001, + "step": 3256 + }, + { + "epoch": 2.8893815118598978, + "grad_norm": 0.0018261591903865337, + "learning_rate": 3.5201438530300666e-08, + "loss": 0.0, + "step": 3257 + }, + { + "epoch": 2.8902682332077143, + "grad_norm": 0.002138376934453845, + "learning_rate": 3.463661655423889e-08, + "loss": 0.0, + "step": 3258 + }, + { + "epoch": 2.891154954555531, + "grad_norm": 0.003766312263906002, + "learning_rate": 3.407634695589135e-08, + "loss": 0.0, + "step": 3259 + }, + { + "epoch": 2.8920416759033474, + "grad_norm": 0.0012409350601956248, + "learning_rate": 3.352063024892882e-08, + "loss": 0.0, + "step": 3260 + }, + { + "epoch": 2.892928397251164, + "grad_norm": 0.7102566361427307, + "learning_rate": 3.296946694284764e-08, + "loss": 0.0036, + "step": 3261 + }, + { + "epoch": 2.8938151185989804, + "grad_norm": 0.1386748105287552, + "learning_rate": 3.242285754296859e-08, + "loss": 0.0008, + "step": 3262 + }, + { + "epoch": 2.894701839946797, + "grad_norm": 0.05686341971158981, + "learning_rate": 3.188080255043746e-08, + "loss": 0.0001, + "step": 3263 + }, + { + "epoch": 2.895588561294613, + "grad_norm": 0.011563396081328392, + "learning_rate": 3.1343302462225054e-08, + "loss": 0.0, + "step": 3264 + }, + { + "epoch": 2.8964752826424296, + "grad_norm": 0.0008911413606256247, + "learning_rate": 3.0810357771126064e-08, + "loss": 0.0, + "step": 3265 + }, + { + "epoch": 2.897362003990246, + "grad_norm": 0.0011407394194975495, + "learning_rate": 3.028196896575852e-08, + "loss": 0.0, + "step": 3266 + }, + { + "epoch": 2.8982487253380627, + "grad_norm": 0.0022614169865846634, + "learning_rate": 2.9758136530562143e-08, + "loss": 0.0, + "step": 3267 + }, + { + "epoch": 2.8991354466858787, + "grad_norm": 0.002719403011724353, + "learning_rate": 2.9238860945802193e-08, + "loss": 0.0, + "step": 3268 + }, + { + "epoch": 2.9000221680336953, + "grad_norm": 0.0018850235501304269, + "learning_rate": 2.872414268756285e-08, + "loss": 0.0, + "step": 3269 + }, + { + "epoch": 2.900908889381512, + "grad_norm": 0.0057585276663303375, + "learning_rate": 2.821398222775329e-08, + "loss": 0.0, + "step": 3270 + }, + { + "epoch": 2.9017956107293283, + "grad_norm": 0.00659179175272584, + "learning_rate": 2.7708380034099923e-08, + "loss": 0.0, + "step": 3271 + }, + { + "epoch": 2.902682332077145, + "grad_norm": 0.0016464820364490151, + "learning_rate": 2.720733657015473e-08, + "loss": 0.0, + "step": 3272 + }, + { + "epoch": 2.9035690534249614, + "grad_norm": 0.0004565789131447673, + "learning_rate": 2.6710852295286362e-08, + "loss": 0.0, + "step": 3273 + }, + { + "epoch": 2.9044557747727775, + "grad_norm": 0.012226813472807407, + "learning_rate": 2.6218927664685145e-08, + "loss": 0.0, + "step": 3274 + }, + { + "epoch": 2.905342496120594, + "grad_norm": 0.00505524966865778, + "learning_rate": 2.573156312936087e-08, + "loss": 0.0, + "step": 3275 + }, + { + "epoch": 2.9062292174684106, + "grad_norm": 0.008225847966969013, + "learning_rate": 2.5248759136142222e-08, + "loss": 0.0, + "step": 3276 + }, + { + "epoch": 2.907115938816227, + "grad_norm": 0.008066046983003616, + "learning_rate": 2.4770516127676226e-08, + "loss": 0.0, + "step": 3277 + }, + { + "epoch": 2.908002660164043, + "grad_norm": 0.037920404225587845, + "learning_rate": 2.4296834542429926e-08, + "loss": 0.0, + "step": 3278 + }, + { + "epoch": 2.9088893815118597, + "grad_norm": 0.01949114352464676, + "learning_rate": 2.3827714814686488e-08, + "loss": 0.0001, + "step": 3279 + }, + { + "epoch": 2.9097761028596763, + "grad_norm": 0.00045723363291472197, + "learning_rate": 2.3363157374547417e-08, + "loss": 0.0, + "step": 3280 + }, + { + "epoch": 2.910662824207493, + "grad_norm": 0.0014682360924780369, + "learning_rate": 2.2903162647932022e-08, + "loss": 0.0, + "step": 3281 + }, + { + "epoch": 2.9115495455553093, + "grad_norm": 0.0035071983002126217, + "learning_rate": 2.244773105657516e-08, + "loss": 0.0, + "step": 3282 + }, + { + "epoch": 2.912436266903126, + "grad_norm": 0.0004839412576984614, + "learning_rate": 2.1996863018028945e-08, + "loss": 0.0, + "step": 3283 + }, + { + "epoch": 2.9133229882509424, + "grad_norm": 0.0006948516820557415, + "learning_rate": 2.155055894566105e-08, + "loss": 0.0, + "step": 3284 + }, + { + "epoch": 2.9142097095987585, + "grad_norm": 0.07006675004959106, + "learning_rate": 2.1108819248655266e-08, + "loss": 0.0003, + "step": 3285 + }, + { + "epoch": 2.915096430946575, + "grad_norm": 0.11140327900648117, + "learning_rate": 2.0671644332009854e-08, + "loss": 0.0006, + "step": 3286 + }, + { + "epoch": 2.9159831522943915, + "grad_norm": 0.0038438565097749233, + "learning_rate": 2.0239034596538644e-08, + "loss": 0.0, + "step": 3287 + }, + { + "epoch": 2.916869873642208, + "grad_norm": 0.0013927739346399903, + "learning_rate": 1.9810990438869913e-08, + "loss": 0.0, + "step": 3288 + }, + { + "epoch": 2.917756594990024, + "grad_norm": 0.004284338094294071, + "learning_rate": 1.938751225144586e-08, + "loss": 0.0, + "step": 3289 + }, + { + "epoch": 2.9186433163378407, + "grad_norm": 0.03040909208357334, + "learning_rate": 1.8968600422522577e-08, + "loss": 0.0001, + "step": 3290 + }, + { + "epoch": 2.919530037685657, + "grad_norm": 0.004622400738298893, + "learning_rate": 1.8554255336170056e-08, + "loss": 0.0, + "step": 3291 + }, + { + "epoch": 2.9204167590334738, + "grad_norm": 0.005725715775042772, + "learning_rate": 1.8144477372269988e-08, + "loss": 0.0, + "step": 3292 + }, + { + "epoch": 2.9213034803812903, + "grad_norm": 0.06149230897426605, + "learning_rate": 1.7739266906519058e-08, + "loss": 0.0002, + "step": 3293 + }, + { + "epoch": 2.922190201729107, + "grad_norm": 0.005780136212706566, + "learning_rate": 1.733862431042399e-08, + "loss": 0.0, + "step": 3294 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.013424360193312168, + "learning_rate": 1.6942549951304844e-08, + "loss": 0.0, + "step": 3295 + }, + { + "epoch": 2.9239636444247394, + "grad_norm": 0.0061098430305719376, + "learning_rate": 1.655104419229281e-08, + "loss": 0.0, + "step": 3296 + }, + { + "epoch": 2.924850365772556, + "grad_norm": 0.0005447586299851537, + "learning_rate": 1.6164107392331873e-08, + "loss": 0.0, + "step": 3297 + }, + { + "epoch": 2.9257370871203725, + "grad_norm": 0.019417855888605118, + "learning_rate": 1.5781739906175486e-08, + "loss": 0.0, + "step": 3298 + }, + { + "epoch": 2.926623808468189, + "grad_norm": 0.0017337505705654621, + "learning_rate": 1.5403942084387667e-08, + "loss": 0.0, + "step": 3299 + }, + { + "epoch": 2.927510529816005, + "grad_norm": 0.006602004170417786, + "learning_rate": 1.503071427334468e-08, + "loss": 0.0, + "step": 3300 + }, + { + "epoch": 2.9283972511638217, + "grad_norm": 0.011092391796410084, + "learning_rate": 1.466205681523114e-08, + "loss": 0.0001, + "step": 3301 + }, + { + "epoch": 2.929283972511638, + "grad_norm": 0.0026441768277436495, + "learning_rate": 1.4297970048041676e-08, + "loss": 0.0, + "step": 3302 + }, + { + "epoch": 2.9301706938594547, + "grad_norm": 0.00047737229033373296, + "learning_rate": 1.393845430558205e-08, + "loss": 0.0, + "step": 3303 + }, + { + "epoch": 2.9310574152072713, + "grad_norm": 0.011745357885956764, + "learning_rate": 1.3583509917464155e-08, + "loss": 0.0001, + "step": 3304 + }, + { + "epoch": 2.931944136555088, + "grad_norm": 0.0005886032013222575, + "learning_rate": 1.3233137209112123e-08, + "loss": 0.0, + "step": 3305 + }, + { + "epoch": 2.932830857902904, + "grad_norm": 0.039524223655462265, + "learning_rate": 1.2887336501755666e-08, + "loss": 0.0001, + "step": 3306 + }, + { + "epoch": 2.9337175792507204, + "grad_norm": 0.008052697405219078, + "learning_rate": 1.2546108112435062e-08, + "loss": 0.0, + "step": 3307 + }, + { + "epoch": 2.934604300598537, + "grad_norm": 0.003623201046139002, + "learning_rate": 1.2209452353997841e-08, + "loss": 0.0, + "step": 3308 + }, + { + "epoch": 2.9354910219463535, + "grad_norm": 0.0007283863378688693, + "learning_rate": 1.187736953509766e-08, + "loss": 0.0, + "step": 3309 + }, + { + "epoch": 2.9363777432941696, + "grad_norm": 0.13052868843078613, + "learning_rate": 1.154985996019764e-08, + "loss": 0.0003, + "step": 3310 + }, + { + "epoch": 2.937264464641986, + "grad_norm": 0.002719426527619362, + "learning_rate": 1.122692392956759e-08, + "loss": 0.0, + "step": 3311 + }, + { + "epoch": 2.9381511859898026, + "grad_norm": 0.10330086201429367, + "learning_rate": 1.0908561739283452e-08, + "loss": 0.0004, + "step": 3312 + }, + { + "epoch": 2.939037907337619, + "grad_norm": 0.009346824139356613, + "learning_rate": 1.059477368122841e-08, + "loss": 0.0, + "step": 3313 + }, + { + "epoch": 2.9399246286854357, + "grad_norm": 0.004005473107099533, + "learning_rate": 1.028556004309178e-08, + "loss": 0.0, + "step": 3314 + }, + { + "epoch": 2.9408113500332522, + "grad_norm": 0.0023341896012425423, + "learning_rate": 9.980921108368457e-09, + "loss": 0.0, + "step": 3315 + }, + { + "epoch": 2.9416980713810688, + "grad_norm": 0.0034011153038591146, + "learning_rate": 9.68085715636058e-09, + "loss": 0.0, + "step": 3316 + }, + { + "epoch": 2.942584792728885, + "grad_norm": 0.0011600746074691415, + "learning_rate": 9.385368462173083e-09, + "loss": 0.0, + "step": 3317 + }, + { + "epoch": 2.9434715140767014, + "grad_norm": 0.0015895559918135405, + "learning_rate": 9.094455296719817e-09, + "loss": 0.0, + "step": 3318 + }, + { + "epoch": 2.944358235424518, + "grad_norm": 0.014281976036727428, + "learning_rate": 8.80811792671632e-09, + "loss": 0.0, + "step": 3319 + }, + { + "epoch": 2.9452449567723344, + "grad_norm": 0.2445225864648819, + "learning_rate": 8.526356614684816e-09, + "loss": 0.001, + "step": 3320 + }, + { + "epoch": 2.9461316781201505, + "grad_norm": 0.011881627142429352, + "learning_rate": 8.249171618952002e-09, + "loss": 0.0, + "step": 3321 + }, + { + "epoch": 2.947018399467967, + "grad_norm": 0.1268923282623291, + "learning_rate": 7.976563193647924e-09, + "loss": 0.0003, + "step": 3322 + }, + { + "epoch": 2.9479051208157836, + "grad_norm": 0.013145245611667633, + "learning_rate": 7.708531588707103e-09, + "loss": 0.0, + "step": 3323 + }, + { + "epoch": 2.9487918421636, + "grad_norm": 0.0036690891720354557, + "learning_rate": 7.445077049868521e-09, + "loss": 0.0, + "step": 3324 + }, + { + "epoch": 2.9496785635114167, + "grad_norm": 0.0030290221329778433, + "learning_rate": 7.186199818673967e-09, + "loss": 0.0, + "step": 3325 + }, + { + "epoch": 2.950565284859233, + "grad_norm": 0.006416503805667162, + "learning_rate": 6.931900132469694e-09, + "loss": 0.0, + "step": 3326 + }, + { + "epoch": 2.9514520062070493, + "grad_norm": 0.059920359402894974, + "learning_rate": 6.682178224403646e-09, + "loss": 0.0003, + "step": 3327 + }, + { + "epoch": 2.952338727554866, + "grad_norm": 0.042692240327596664, + "learning_rate": 6.4370343234282375e-09, + "loss": 0.0001, + "step": 3328 + }, + { + "epoch": 2.9532254489026823, + "grad_norm": 0.01230137050151825, + "learning_rate": 6.1964686542975716e-09, + "loss": 0.0, + "step": 3329 + }, + { + "epoch": 2.954112170250499, + "grad_norm": 0.0041125863790512085, + "learning_rate": 5.9604814375685546e-09, + "loss": 0.0, + "step": 3330 + }, + { + "epoch": 2.954998891598315, + "grad_norm": 0.01572389528155327, + "learning_rate": 5.72907288960034e-09, + "loss": 0.0, + "step": 3331 + }, + { + "epoch": 2.9558856129461315, + "grad_norm": 0.0016322958981618285, + "learning_rate": 5.502243222555437e-09, + "loss": 0.0, + "step": 3332 + }, + { + "epoch": 2.956772334293948, + "grad_norm": 0.30885788798332214, + "learning_rate": 5.279992644396381e-09, + "loss": 0.0061, + "step": 3333 + }, + { + "epoch": 2.9576590556417646, + "grad_norm": 0.0038180218543857336, + "learning_rate": 5.0623213588885114e-09, + "loss": 0.0, + "step": 3334 + }, + { + "epoch": 2.958545776989581, + "grad_norm": 0.01314638089388609, + "learning_rate": 4.849229565599412e-09, + "loss": 0.0001, + "step": 3335 + }, + { + "epoch": 2.9594324983373976, + "grad_norm": 0.005611849948763847, + "learning_rate": 4.640717459896693e-09, + "loss": 0.0, + "step": 3336 + }, + { + "epoch": 2.960319219685214, + "grad_norm": 0.006579660344868898, + "learning_rate": 4.436785232950214e-09, + "loss": 0.0, + "step": 3337 + }, + { + "epoch": 2.9612059410330303, + "grad_norm": 0.00035080299130640924, + "learning_rate": 4.237433071729857e-09, + "loss": 0.0, + "step": 3338 + }, + { + "epoch": 2.962092662380847, + "grad_norm": 0.0008623858448117971, + "learning_rate": 4.042661159007755e-09, + "loss": 0.0, + "step": 3339 + }, + { + "epoch": 2.9629793837286633, + "grad_norm": 0.1388547718524933, + "learning_rate": 3.852469673355508e-09, + "loss": 0.0007, + "step": 3340 + }, + { + "epoch": 2.96386610507648, + "grad_norm": 0.06763661652803421, + "learning_rate": 3.666858789145855e-09, + "loss": 0.0003, + "step": 3341 + }, + { + "epoch": 2.964752826424296, + "grad_norm": 0.0008432602626271546, + "learning_rate": 3.4858286765515614e-09, + "loss": 0.0, + "step": 3342 + }, + { + "epoch": 2.9656395477721125, + "grad_norm": 0.004892279859632254, + "learning_rate": 3.309379501546528e-09, + "loss": 0.0, + "step": 3343 + }, + { + "epoch": 2.966526269119929, + "grad_norm": 0.003920522518455982, + "learning_rate": 3.1375114259035723e-09, + "loss": 0.0, + "step": 3344 + }, + { + "epoch": 2.9674129904677455, + "grad_norm": 0.005874779541045427, + "learning_rate": 2.970224607196093e-09, + "loss": 0.0, + "step": 3345 + }, + { + "epoch": 2.968299711815562, + "grad_norm": 0.05062325671315193, + "learning_rate": 2.8075191987969597e-09, + "loss": 0.0002, + "step": 3346 + }, + { + "epoch": 2.9691864331633786, + "grad_norm": 0.10245689749717712, + "learning_rate": 2.6493953498790692e-09, + "loss": 0.0002, + "step": 3347 + }, + { + "epoch": 2.9700731545111947, + "grad_norm": 0.01170055940747261, + "learning_rate": 2.4958532054142338e-09, + "loss": 0.0, + "step": 3348 + }, + { + "epoch": 2.9709598758590112, + "grad_norm": 0.001403836184181273, + "learning_rate": 2.346892906174847e-09, + "loss": 0.0, + "step": 3349 + }, + { + "epoch": 2.9718465972068278, + "grad_norm": 0.06057799234986305, + "learning_rate": 2.2025145887305533e-09, + "loss": 0.0001, + "step": 3350 + }, + { + "epoch": 2.9727333185546443, + "grad_norm": 0.04310649633407593, + "learning_rate": 2.062718385451579e-09, + "loss": 0.0001, + "step": 3351 + }, + { + "epoch": 2.9736200399024604, + "grad_norm": 0.3466622531414032, + "learning_rate": 1.9275044245076203e-09, + "loss": 0.0006, + "step": 3352 + }, + { + "epoch": 2.974506761250277, + "grad_norm": 0.0031994169112294912, + "learning_rate": 1.7968728298650705e-09, + "loss": 0.0, + "step": 3353 + }, + { + "epoch": 2.9753934825980934, + "grad_norm": 1.4261260032653809, + "learning_rate": 1.6708237212920142e-09, + "loss": 0.0008, + "step": 3354 + }, + { + "epoch": 2.97628020394591, + "grad_norm": 0.0007013491122052073, + "learning_rate": 1.5493572143521207e-09, + "loss": 0.0, + "step": 3355 + }, + { + "epoch": 2.9771669252937265, + "grad_norm": 0.1456507444381714, + "learning_rate": 1.4324734204101964e-09, + "loss": 0.0007, + "step": 3356 + }, + { + "epoch": 2.978053646641543, + "grad_norm": 0.006528434809297323, + "learning_rate": 1.3201724466277432e-09, + "loss": 0.0, + "step": 3357 + }, + { + "epoch": 2.9789403679893596, + "grad_norm": 0.00108535576146096, + "learning_rate": 1.2124543959662894e-09, + "loss": 0.0, + "step": 3358 + }, + { + "epoch": 2.9798270893371757, + "grad_norm": 0.015093772672116756, + "learning_rate": 1.1093193671835035e-09, + "loss": 0.0001, + "step": 3359 + }, + { + "epoch": 2.980713810684992, + "grad_norm": 0.009435775689780712, + "learning_rate": 1.0107674548365255e-09, + "loss": 0.0, + "step": 3360 + }, + { + "epoch": 2.9816005320328087, + "grad_norm": 0.001818368211388588, + "learning_rate": 9.167987492808561e-10, + "loss": 0.0, + "step": 3361 + }, + { + "epoch": 2.9824872533806253, + "grad_norm": 0.7512038946151733, + "learning_rate": 8.274133366698023e-10, + "loss": 0.0029, + "step": 3362 + }, + { + "epoch": 2.9833739747284413, + "grad_norm": 0.33148589730262756, + "learning_rate": 7.426112989533663e-10, + "loss": 0.0026, + "step": 3363 + }, + { + "epoch": 2.984260696076258, + "grad_norm": 0.08316826820373535, + "learning_rate": 6.623927138804665e-10, + "loss": 0.0004, + "step": 3364 + }, + { + "epoch": 2.9851474174240744, + "grad_norm": 0.023770712316036224, + "learning_rate": 5.867576549983822e-10, + "loss": 0.0001, + "step": 3365 + }, + { + "epoch": 2.986034138771891, + "grad_norm": 0.006840167101472616, + "learning_rate": 5.157061916505335e-10, + "loss": 0.0, + "step": 3366 + }, + { + "epoch": 2.9869208601197075, + "grad_norm": 0.005075496155768633, + "learning_rate": 4.492383889792562e-10, + "loss": 0.0, + "step": 3367 + }, + { + "epoch": 2.987807581467524, + "grad_norm": 0.03411711007356644, + "learning_rate": 3.873543079241371e-10, + "loss": 0.0001, + "step": 3368 + }, + { + "epoch": 2.9886943028153405, + "grad_norm": 0.38265281915664673, + "learning_rate": 3.300540052214585e-10, + "loss": 0.0014, + "step": 3369 + }, + { + "epoch": 2.9895810241631566, + "grad_norm": 0.0326530784368515, + "learning_rate": 2.773375334064188e-10, + "loss": 0.0001, + "step": 3370 + }, + { + "epoch": 2.990467745510973, + "grad_norm": 0.002753943670541048, + "learning_rate": 2.2920494081091204e-10, + "loss": 0.0, + "step": 3371 + }, + { + "epoch": 2.9913544668587897, + "grad_norm": 0.07752973586320877, + "learning_rate": 1.8565627156352794e-10, + "loss": 0.0003, + "step": 3372 + }, + { + "epoch": 2.9922411882066062, + "grad_norm": 0.0007558520883321762, + "learning_rate": 1.466915655912171e-10, + "loss": 0.0, + "step": 3373 + }, + { + "epoch": 2.9931279095544223, + "grad_norm": 0.052483346313238144, + "learning_rate": 1.123108586176258e-10, + "loss": 0.0002, + "step": 3374 + }, + { + "epoch": 2.994014630902239, + "grad_norm": 0.0038302731700241566, + "learning_rate": 8.251418216420615e-11, + "loss": 0.0, + "step": 3375 + }, + { + "epoch": 2.9949013522500554, + "grad_norm": 0.0008110279450193048, + "learning_rate": 5.730156354966099e-11, + "loss": 0.0, + "step": 3376 + }, + { + "epoch": 2.995788073597872, + "grad_norm": 0.00048128835624083877, + "learning_rate": 3.66730258888337e-11, + "loss": 0.0, + "step": 3377 + }, + { + "epoch": 2.9966747949456884, + "grad_norm": 0.004053472075611353, + "learning_rate": 2.0628588094928627e-11, + "loss": 0.0, + "step": 3378 + }, + { + "epoch": 2.997561516293505, + "grad_norm": 0.014079282991588116, + "learning_rate": 9.168264877845723e-12, + "loss": 0.0001, + "step": 3379 + }, + { + "epoch": 2.998448237641321, + "grad_norm": 0.0015199949266389012, + "learning_rate": 2.292066744735699e-12, + "loss": 0.0, + "step": 3380 + }, + { + "epoch": 2.9993349589891376, + "grad_norm": 0.001595963491126895, + "learning_rate": 0.0, + "loss": 0.0, + "step": 3381 + } + ], + "logging_steps": 1, + "max_steps": 3381, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 564, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.102510967617749e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}