diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7654 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999954795108831, + "eval_steps": 1000, + "global_step": 110607, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 0.00024521369940743435, + "loss": 5.4472, + "step": 100 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002927618508292659, + "loss": 3.467, + "step": 200 + }, + { + "epoch": 0.0, + "learning_rate": 0.0002997989257927486, + "loss": 3.3454, + "step": 300 + }, + { + "epoch": 0.0, + "learning_rate": 0.00029952720389105764, + "loss": 3.2872, + "step": 400 + }, + { + "epoch": 0.0, + "learning_rate": 0.00029925548198936656, + "loss": 3.2489, + "step": 500 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002989837600876756, + "loss": 3.2458, + "step": 600 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029871475540500146, + "loss": 3.2073, + "step": 700 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029844303350331044, + "loss": 3.1957, + "step": 800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002981713116016194, + "loss": 3.1817, + "step": 900 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029789958969992844, + "loss": 3.1631, + "step": 1000 + }, + { + "epoch": 0.01, + "eval_accuracy": 0.4182117332510669, + "eval_loss": 3.180420398712158, + "eval_runtime": 43.6723, + "eval_samples_per_second": 148.447, + "eval_steps_per_second": 2.496, + "step": 1000 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002976278677982374, + "loss": 3.1598, + "step": 1100 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002973561458965464, + "loss": 3.1584, + "step": 1200 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029708442399485537, + "loss": 3.144, + "step": 1300 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029681270209316434, + "loss": 3.1346, + "step": 1400 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002965409801914733, + "loss": 3.1359, + "step": 1500 + }, + { + "epoch": 0.01, + "learning_rate": 0.00029626925828978235, + "loss": 3.1268, + "step": 1600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002959975363880913, + "loss": 3.1175, + "step": 1700 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002957258144864003, + "loss": 3.1189, + "step": 1800 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002954540925847093, + "loss": 3.1057, + "step": 1900 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029518237068301825, + "loss": 3.1124, + "step": 2000 + }, + { + "epoch": 0.02, + "eval_accuracy": 0.4272265623818554, + "eval_loss": 3.106520891189575, + "eval_runtime": 43.4484, + "eval_samples_per_second": 149.212, + "eval_steps_per_second": 2.509, + "step": 2000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002949106487813272, + "loss": 3.1004, + "step": 2100 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002946389268796362, + "loss": 3.1018, + "step": 2200 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029436720497794523, + "loss": 3.0864, + "step": 2300 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002940954830762542, + "loss": 3.0872, + "step": 2400 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002938237611745632, + "loss": 3.0883, + "step": 2500 + }, + { + "epoch": 0.02, + "learning_rate": 0.00029355475649188906, + "loss": 3.0843, + "step": 2600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002932830345901981, + "loss": 3.0815, + "step": 2700 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029301131268850706, + "loss": 3.0784, + "step": 2800 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029273959078681604, + "loss": 3.0728, + "step": 2900 + }, + { + "epoch": 0.03, + "learning_rate": 0.000292467868885125, + "loss": 3.0757, + "step": 3000 + }, + { + "epoch": 0.03, + "eval_accuracy": 0.42875514543315396, + "eval_loss": 3.0894298553466797, + "eval_runtime": 43.8742, + "eval_samples_per_second": 147.763, + "eval_steps_per_second": 2.484, + "step": 3000 + }, + { + "epoch": 0.03, + "learning_rate": 0.000292196146983434, + "loss": 3.0754, + "step": 3100 + }, + { + "epoch": 0.03, + "learning_rate": 0.000291924425081743, + "loss": 3.0634, + "step": 3200 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029165270318005194, + "loss": 3.0652, + "step": 3300 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029138098127836097, + "loss": 3.0566, + "step": 3400 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029110925937666994, + "loss": 3.067, + "step": 3500 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002908375374749789, + "loss": 3.0525, + "step": 3600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002905658155732879, + "loss": 3.0595, + "step": 3700 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029029409367159687, + "loss": 3.0586, + "step": 3800 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002900223717699059, + "loss": 3.0499, + "step": 3900 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002897506498682148, + "loss": 3.0488, + "step": 4000 + }, + { + "epoch": 0.04, + "eval_accuracy": 0.43185525945686004, + "eval_loss": 3.062988758087158, + "eval_runtime": 44.0507, + "eval_samples_per_second": 147.171, + "eval_steps_per_second": 2.474, + "step": 4000 + }, + { + "epoch": 0.04, + "learning_rate": 0.00028947892796652385, + "loss": 3.0426, + "step": 4100 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002892072060648328, + "loss": 3.0433, + "step": 4200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002889354841631418, + "loss": 3.0428, + "step": 4300 + }, + { + "epoch": 0.04, + "learning_rate": 0.00028866376226145083, + "loss": 3.0359, + "step": 4400 + }, + { + "epoch": 0.04, + "learning_rate": 0.00028839204035975975, + "loss": 3.0386, + "step": 4500 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002881203184580688, + "loss": 3.034, + "step": 4600 + }, + { + "epoch": 0.04, + "learning_rate": 0.00028784859655637776, + "loss": 3.0456, + "step": 4700 + }, + { + "epoch": 0.04, + "learning_rate": 0.00028757687465468673, + "loss": 3.0361, + "step": 4800 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002873051527529957, + "loss": 3.0412, + "step": 4900 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002870334308513047, + "loss": 3.0403, + "step": 5000 + }, + { + "epoch": 0.05, + "eval_accuracy": 0.43361975362410893, + "eval_loss": 3.0423271656036377, + "eval_runtime": 43.1088, + "eval_samples_per_second": 150.387, + "eval_steps_per_second": 2.528, + "step": 5000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002867617089496137, + "loss": 3.0341, + "step": 5100 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028648998704792263, + "loss": 3.0297, + "step": 5200 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028621826514623166, + "loss": 3.0324, + "step": 5300 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028594654324454064, + "loss": 3.0317, + "step": 5400 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002856748213428496, + "loss": 3.0167, + "step": 5500 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002854030994411586, + "loss": 3.0202, + "step": 5600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028513137753946756, + "loss": 3.0231, + "step": 5700 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002848596556377766, + "loss": 3.0166, + "step": 5800 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002845879337360855, + "loss": 3.0246, + "step": 5900 + }, + { + "epoch": 0.05, + "learning_rate": 0.00028431621183439454, + "loss": 3.0172, + "step": 6000 + }, + { + "epoch": 0.05, + "eval_accuracy": 0.434315993866311, + "eval_loss": 3.038356304168701, + "eval_runtime": 43.5763, + "eval_samples_per_second": 148.774, + "eval_steps_per_second": 2.501, + "step": 6000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002840444899327035, + "loss": 3.0123, + "step": 6100 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002837727680310125, + "loss": 3.0177, + "step": 6200 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002835010461293215, + "loss": 3.0195, + "step": 6300 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002832320414466474, + "loss": 3.0175, + "step": 6400 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002829603195449564, + "loss": 3.0192, + "step": 6500 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028268859764326535, + "loss": 3.0079, + "step": 6600 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002824168757415743, + "loss": 3.0138, + "step": 6700 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002821451538398833, + "loss": 3.0175, + "step": 6800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028187343193819233, + "loss": 3.0148, + "step": 6900 + }, + { + "epoch": 0.06, + "learning_rate": 0.00028160171003650125, + "loss": 3.0102, + "step": 7000 + }, + { + "epoch": 0.06, + "eval_accuracy": 0.43602120780442366, + "eval_loss": 3.026742696762085, + "eval_runtime": 43.2189, + "eval_samples_per_second": 150.004, + "eval_steps_per_second": 2.522, + "step": 7000 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002813299881348103, + "loss": 3.0111, + "step": 7100 + }, + { + "epoch": 0.07, + "learning_rate": 0.00028105826623311925, + "loss": 3.0077, + "step": 7200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00028078654433142823, + "loss": 3.0055, + "step": 7300 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002805148224297372, + "loss": 3.0084, + "step": 7400 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002802431005280462, + "loss": 3.0124, + "step": 7500 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002799713786263552, + "loss": 3.0051, + "step": 7600 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027969965672466413, + "loss": 3.0039, + "step": 7700 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027942793482297316, + "loss": 3.0033, + "step": 7800 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027915621292128214, + "loss": 3.0044, + "step": 7900 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002788844910195911, + "loss": 2.9888, + "step": 8000 + }, + { + "epoch": 0.07, + "eval_accuracy": 0.4361119428490199, + "eval_loss": 3.0189716815948486, + "eval_runtime": 43.5746, + "eval_samples_per_second": 148.779, + "eval_steps_per_second": 2.501, + "step": 8000 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027861276911790014, + "loss": 3.0097, + "step": 8100 + }, + { + "epoch": 0.07, + "learning_rate": 0.00027834104721620906, + "loss": 3.008, + "step": 8200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002780693253145181, + "loss": 2.9979, + "step": 8300 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027779760341282707, + "loss": 2.994, + "step": 8400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027752588151113604, + "loss": 2.9985, + "step": 8500 + }, + { + "epoch": 0.08, + "learning_rate": 0.000277254159609445, + "loss": 2.9966, + "step": 8600 + }, + { + "epoch": 0.08, + "learning_rate": 0.000276982437707754, + "loss": 2.9968, + "step": 8700 + }, + { + "epoch": 0.08, + "learning_rate": 0.000276710715806063, + "loss": 2.9999, + "step": 8800 + }, + { + "epoch": 0.08, + "learning_rate": 0.000276438993904372, + "loss": 2.9973, + "step": 8900 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027616727200268097, + "loss": 3.0024, + "step": 9000 + }, + { + "epoch": 0.08, + "eval_accuracy": 0.4384970647213073, + "eval_loss": 3.0039989948272705, + "eval_runtime": 44.0911, + "eval_samples_per_second": 147.036, + "eval_steps_per_second": 2.472, + "step": 9000 + }, + { + "epoch": 0.08, + "learning_rate": 0.00027589555010098995, + "loss": 3.0015, + "step": 9100 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002756238281992989, + "loss": 2.9946, + "step": 9200 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002753521062976079, + "loss": 2.9932, + "step": 9300 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002750803843959169, + "loss": 2.9985, + "step": 9400 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002748086624942259, + "loss": 2.9913, + "step": 9500 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002745369405925349, + "loss": 2.9946, + "step": 9600 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027426521869084385, + "loss": 2.99, + "step": 9700 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027399349678915283, + "loss": 2.9927, + "step": 9800 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002737217748874618, + "loss": 2.9883, + "step": 9900 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027345005298577083, + "loss": 2.9948, + "step": 10000 + }, + { + "epoch": 0.09, + "eval_accuracy": 0.43782985969337607, + "eval_loss": 3.0057804584503174, + "eval_runtime": 43.6419, + "eval_samples_per_second": 148.55, + "eval_steps_per_second": 2.498, + "step": 10000 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002731810483030967, + "loss": 2.983, + "step": 10100 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002729093264014057, + "loss": 2.9806, + "step": 10200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027263760449971466, + "loss": 2.9881, + "step": 10300 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027236588259802364, + "loss": 2.9814, + "step": 10400 + }, + { + "epoch": 0.09, + "learning_rate": 0.00027209416069633267, + "loss": 2.9824, + "step": 10500 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027182243879464164, + "loss": 2.9885, + "step": 10600 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002715507168929506, + "loss": 2.989, + "step": 10700 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002712789949912596, + "loss": 2.986, + "step": 10800 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027100727308956857, + "loss": 2.9856, + "step": 10900 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027073555118787754, + "loss": 2.9774, + "step": 11000 + }, + { + "epoch": 0.1, + "eval_accuracy": 0.438878756808909, + "eval_loss": 2.9962034225463867, + "eval_runtime": 43.3441, + "eval_samples_per_second": 149.57, + "eval_steps_per_second": 2.515, + "step": 11000 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002704638292861865, + "loss": 2.9941, + "step": 11100 + }, + { + "epoch": 0.1, + "learning_rate": 0.00027019210738449555, + "loss": 2.9799, + "step": 11200 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002699203854828045, + "loss": 2.9834, + "step": 11300 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002696513808001304, + "loss": 2.9767, + "step": 11400 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002693796588984394, + "loss": 2.9772, + "step": 11500 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002691079369967484, + "loss": 2.9891, + "step": 11600 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002688362150950573, + "loss": 2.9787, + "step": 11700 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026856449319336635, + "loss": 2.987, + "step": 11800 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026829277129167533, + "loss": 2.979, + "step": 11900 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002680210493899843, + "loss": 2.9818, + "step": 12000 + }, + { + "epoch": 0.11, + "eval_accuracy": 0.4390166740766953, + "eval_loss": 2.9964208602905273, + "eval_runtime": 44.0294, + "eval_samples_per_second": 147.242, + "eval_steps_per_second": 2.476, + "step": 12000 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002677493274882933, + "loss": 2.9798, + "step": 12100 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026747760558660225, + "loss": 2.9833, + "step": 12200 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002672058836849113, + "loss": 2.9787, + "step": 12300 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026693416178322026, + "loss": 2.9807, + "step": 12400 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026666515710054614, + "loss": 2.9846, + "step": 12500 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002663934351988551, + "loss": 2.9758, + "step": 12600 + }, + { + "epoch": 0.11, + "learning_rate": 0.00026612171329716414, + "loss": 2.9749, + "step": 12700 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026584999139547306, + "loss": 2.9688, + "step": 12800 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002655782694937821, + "loss": 2.9886, + "step": 12900 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026530654759209107, + "loss": 2.9771, + "step": 13000 + }, + { + "epoch": 0.12, + "eval_accuracy": 0.4395580598427864, + "eval_loss": 2.991270065307617, + "eval_runtime": 43.0298, + "eval_samples_per_second": 150.663, + "eval_steps_per_second": 2.533, + "step": 13000 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026503482569040004, + "loss": 2.9802, + "step": 13100 + }, + { + "epoch": 0.12, + "learning_rate": 0.000264763103788709, + "loss": 2.9711, + "step": 13200 + }, + { + "epoch": 0.12, + "learning_rate": 0.000264491381887018, + "loss": 2.9845, + "step": 13300 + }, + { + "epoch": 0.12, + "learning_rate": 0.000264219659985327, + "loss": 2.9735, + "step": 13400 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026394793808363594, + "loss": 2.9731, + "step": 13500 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026367621618194497, + "loss": 2.9717, + "step": 13600 + }, + { + "epoch": 0.12, + "learning_rate": 0.00026340449428025395, + "loss": 2.9718, + "step": 13700 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002631327723785629, + "loss": 2.9766, + "step": 13800 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002628637676958888, + "loss": 2.9812, + "step": 13900 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026259204579419783, + "loss": 2.9786, + "step": 14000 + }, + { + "epoch": 0.13, + "eval_accuracy": 0.43911829732664315, + "eval_loss": 2.9915201663970947, + "eval_runtime": 43.7467, + "eval_samples_per_second": 148.194, + "eval_steps_per_second": 2.492, + "step": 14000 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002623203238925068, + "loss": 2.9757, + "step": 14100 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002620486019908158, + "loss": 2.9781, + "step": 14200 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026177959730814166, + "loss": 2.9733, + "step": 14300 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002615078754064507, + "loss": 2.9773, + "step": 14400 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002612361535047596, + "loss": 2.9755, + "step": 14500 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026096443160306864, + "loss": 2.9837, + "step": 14600 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002606927097013776, + "loss": 2.9786, + "step": 14700 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002604209877996866, + "loss": 2.9709, + "step": 14800 + }, + { + "epoch": 0.13, + "learning_rate": 0.00026014926589799556, + "loss": 2.9797, + "step": 14900 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025987754399630454, + "loss": 2.9866, + "step": 15000 + }, + { + "epoch": 0.14, + "eval_accuracy": 0.4393814289559723, + "eval_loss": 2.9924139976501465, + "eval_runtime": 43.2705, + "eval_samples_per_second": 149.825, + "eval_steps_per_second": 2.519, + "step": 15000 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025960582209461357, + "loss": 2.976, + "step": 15100 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002593341001929225, + "loss": 2.9674, + "step": 15200 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002590623782912315, + "loss": 2.98, + "step": 15300 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002587906563895405, + "loss": 2.9805, + "step": 15400 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025851893448784947, + "loss": 2.9738, + "step": 15500 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025824721258615844, + "loss": 2.9702, + "step": 15600 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002579754906844674, + "loss": 2.9678, + "step": 15700 + }, + { + "epoch": 0.14, + "learning_rate": 0.00025770376878277645, + "loss": 2.9699, + "step": 15800 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002574320468810854, + "loss": 2.9717, + "step": 15900 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002571603249793944, + "loss": 2.9751, + "step": 16000 + }, + { + "epoch": 0.14, + "eval_accuracy": 0.43892109982972055, + "eval_loss": 2.9917728900909424, + "eval_runtime": 44.2385, + "eval_samples_per_second": 146.547, + "eval_steps_per_second": 2.464, + "step": 16000 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002568886030777034, + "loss": 2.9653, + "step": 16100 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025661688117601235, + "loss": 2.9817, + "step": 16200 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002563451592743214, + "loss": 2.9652, + "step": 16300 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025607343737263035, + "loss": 2.9704, + "step": 16400 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025580171547093933, + "loss": 2.9727, + "step": 16500 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002555299935692483, + "loss": 2.9743, + "step": 16600 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002552582716675573, + "loss": 2.9719, + "step": 16700 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025498654976586626, + "loss": 2.9615, + "step": 16800 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025471482786417523, + "loss": 2.973, + "step": 16900 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025444310596248426, + "loss": 2.9702, + "step": 17000 + }, + { + "epoch": 0.15, + "eval_accuracy": 0.4393215438265388, + "eval_loss": 2.992605447769165, + "eval_runtime": 45.7096, + "eval_samples_per_second": 141.83, + "eval_steps_per_second": 2.385, + "step": 17000 + }, + { + "epoch": 0.15, + "learning_rate": 0.00025417410127981014, + "loss": 2.9689, + "step": 17100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002539023793781191, + "loss": 2.9727, + "step": 17200 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002536306574764281, + "loss": 2.9669, + "step": 17300 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025335893557473706, + "loss": 2.9717, + "step": 17400 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002530872136730461, + "loss": 2.9646, + "step": 17500 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025281549177135507, + "loss": 2.9757, + "step": 17600 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025254376986966404, + "loss": 2.9679, + "step": 17700 + }, + { + "epoch": 0.16, + "learning_rate": 0.000252272047967973, + "loss": 2.9691, + "step": 17800 + }, + { + "epoch": 0.16, + "learning_rate": 0.000252000326066282, + "loss": 2.9718, + "step": 17900 + }, + { + "epoch": 0.16, + "learning_rate": 0.00025172860416459097, + "loss": 2.9695, + "step": 18000 + }, + { + "epoch": 0.16, + "eval_accuracy": 0.44013513472641874, + "eval_loss": 2.981644868850708, + "eval_runtime": 43.6409, + "eval_samples_per_second": 148.553, + "eval_steps_per_second": 2.498, + "step": 18000 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002514568822629, + "loss": 2.9666, + "step": 18100 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002511878775802259, + "loss": 2.9696, + "step": 18200 + }, + { + "epoch": 0.17, + "learning_rate": 0.00025091615567853485, + "loss": 2.9687, + "step": 18300 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002506444337768438, + "loss": 2.9674, + "step": 18400 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002503727118751528, + "loss": 2.9655, + "step": 18500 + }, + { + "epoch": 0.17, + "learning_rate": 0.00025010098997346183, + "loss": 2.9661, + "step": 18600 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002498292680717708, + "loss": 2.9673, + "step": 18700 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002495575461700798, + "loss": 2.9641, + "step": 18800 + }, + { + "epoch": 0.17, + "learning_rate": 0.00024928582426838876, + "loss": 2.9598, + "step": 18900 + }, + { + "epoch": 0.17, + "learning_rate": 0.00024901410236669773, + "loss": 2.9615, + "step": 19000 + }, + { + "epoch": 0.17, + "eval_accuracy": 0.44022042566833924, + "eval_loss": 2.982591390609741, + "eval_runtime": 43.6998, + "eval_samples_per_second": 148.353, + "eval_steps_per_second": 2.494, + "step": 19000 + }, + { + "epoch": 0.17, + "learning_rate": 0.00024874238046500676, + "loss": 2.958, + "step": 19100 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002484706585633157, + "loss": 2.9688, + "step": 19200 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002481989366616247, + "loss": 2.9603, + "step": 19300 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002479272147599337, + "loss": 2.9625, + "step": 19400 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024765549285824266, + "loss": 2.9611, + "step": 19500 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024738377095655164, + "loss": 2.9594, + "step": 19600 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002471120490548606, + "loss": 2.9648, + "step": 19700 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024684032715316964, + "loss": 2.961, + "step": 19800 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002465686052514786, + "loss": 2.9589, + "step": 19900 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002462968833497876, + "loss": 2.9609, + "step": 20000 + }, + { + "epoch": 0.18, + "eval_accuracy": 0.4406414362752659, + "eval_loss": 2.9791083335876465, + "eval_runtime": 44.2323, + "eval_samples_per_second": 146.567, + "eval_steps_per_second": 2.464, + "step": 20000 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024602516144809657, + "loss": 2.962, + "step": 20100 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024575343954640554, + "loss": 2.9566, + "step": 20200 + }, + { + "epoch": 0.18, + "learning_rate": 0.00024548171764471457, + "loss": 2.964, + "step": 20300 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002452099957430235, + "loss": 2.9573, + "step": 20400 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002449409910603494, + "loss": 2.9621, + "step": 20500 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002446692691586584, + "loss": 2.9568, + "step": 20600 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002443975472569674, + "loss": 2.9643, + "step": 20700 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024412582535527635, + "loss": 2.9614, + "step": 20800 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024385410345358535, + "loss": 2.9546, + "step": 20900 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024358238155189433, + "loss": 2.9607, + "step": 21000 + }, + { + "epoch": 0.19, + "eval_accuracy": 0.44158508073906716, + "eval_loss": 2.9684245586395264, + "eval_runtime": 43.092, + "eval_samples_per_second": 150.446, + "eval_steps_per_second": 2.529, + "step": 21000 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024331065965020333, + "loss": 2.9608, + "step": 21100 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024303893774851228, + "loss": 2.9556, + "step": 21200 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024276721584682128, + "loss": 2.9579, + "step": 21300 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024249549394513028, + "loss": 2.9585, + "step": 21400 + }, + { + "epoch": 0.19, + "learning_rate": 0.00024222377204343926, + "loss": 2.9544, + "step": 21500 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024195205014174823, + "loss": 2.9614, + "step": 21600 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002416803282400572, + "loss": 2.9536, + "step": 21700 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024141132355738314, + "loss": 2.9556, + "step": 21800 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002411396016556921, + "loss": 2.9559, + "step": 21900 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002408678797540011, + "loss": 2.9533, + "step": 22000 + }, + { + "epoch": 0.2, + "eval_accuracy": 0.4422038937432138, + "eval_loss": 2.967719554901123, + "eval_runtime": 42.9224, + "eval_samples_per_second": 151.04, + "eval_steps_per_second": 2.539, + "step": 22000 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024059615785231007, + "loss": 2.9493, + "step": 22100 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024032443595061907, + "loss": 2.9543, + "step": 22200 + }, + { + "epoch": 0.2, + "learning_rate": 0.00024005271404892804, + "loss": 2.9565, + "step": 22300 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023978099214723702, + "loss": 2.9501, + "step": 22400 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023950927024554602, + "loss": 2.9395, + "step": 22500 + }, + { + "epoch": 0.2, + "learning_rate": 0.00023923754834385497, + "loss": 2.9598, + "step": 22600 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023896854366118087, + "loss": 2.9492, + "step": 22700 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023869682175948988, + "loss": 2.947, + "step": 22800 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023842509985779885, + "loss": 2.9573, + "step": 22900 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023815337795610783, + "loss": 2.9513, + "step": 23000 + }, + { + "epoch": 0.21, + "eval_accuracy": 0.4420853332849413, + "eval_loss": 2.9676427841186523, + "eval_runtime": 43.0836, + "eval_samples_per_second": 150.475, + "eval_steps_per_second": 2.53, + "step": 23000 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023788165605441683, + "loss": 2.9472, + "step": 23100 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023760993415272578, + "loss": 2.9513, + "step": 23200 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023733821225103478, + "loss": 2.9542, + "step": 23300 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023706649034934378, + "loss": 2.9497, + "step": 23400 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023679476844765276, + "loss": 2.9565, + "step": 23500 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023652576376497863, + "loss": 2.9518, + "step": 23600 + }, + { + "epoch": 0.21, + "learning_rate": 0.00023625404186328764, + "loss": 2.9471, + "step": 23700 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023598231996159664, + "loss": 2.956, + "step": 23800 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023571059805990559, + "loss": 2.953, + "step": 23900 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002354388761582146, + "loss": 2.9563, + "step": 24000 + }, + { + "epoch": 0.22, + "eval_accuracy": 0.4428523468619285, + "eval_loss": 2.9609880447387695, + "eval_runtime": 42.9764, + "eval_samples_per_second": 150.85, + "eval_steps_per_second": 2.536, + "step": 24000 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023516715425652356, + "loss": 2.9458, + "step": 24100 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023489543235483257, + "loss": 2.9539, + "step": 24200 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023462371045314151, + "loss": 2.9549, + "step": 24300 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023435198855145052, + "loss": 2.9496, + "step": 24400 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023408026664975952, + "loss": 2.9514, + "step": 24500 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002338085447480685, + "loss": 2.9471, + "step": 24600 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023353682284637747, + "loss": 2.9448, + "step": 24700 + }, + { + "epoch": 0.22, + "learning_rate": 0.00023326510094468644, + "loss": 2.948, + "step": 24800 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023299337904299545, + "loss": 2.9454, + "step": 24900 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023272165714130445, + "loss": 2.9466, + "step": 25000 + }, + { + "epoch": 0.23, + "eval_accuracy": 0.44241621374756906, + "eval_loss": 2.9626522064208984, + "eval_runtime": 43.5013, + "eval_samples_per_second": 149.03, + "eval_steps_per_second": 2.506, + "step": 25000 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002324499352396134, + "loss": 2.9417, + "step": 25100 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002321782133379224, + "loss": 2.9452, + "step": 25200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023190649143623138, + "loss": 2.9406, + "step": 25300 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023163476953454038, + "loss": 2.945, + "step": 25400 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023136304763284933, + "loss": 2.9419, + "step": 25500 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023109132573115833, + "loss": 2.9452, + "step": 25600 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023081960382946733, + "loss": 2.9435, + "step": 25700 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023054788192777628, + "loss": 2.947, + "step": 25800 + }, + { + "epoch": 0.23, + "learning_rate": 0.00023027616002608528, + "loss": 2.9343, + "step": 25900 + }, + { + "epoch": 0.24, + "learning_rate": 0.00023000443812439426, + "loss": 2.9431, + "step": 26000 + }, + { + "epoch": 0.24, + "eval_accuracy": 0.442384758932109, + "eval_loss": 2.9589717388153076, + "eval_runtime": 43.1206, + "eval_samples_per_second": 150.346, + "eval_steps_per_second": 2.528, + "step": 26000 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022973271622270326, + "loss": 2.9431, + "step": 26100 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022946099432101226, + "loss": 2.9477, + "step": 26200 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002291892724193212, + "loss": 2.939, + "step": 26300 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002289175505176302, + "loss": 2.9385, + "step": 26400 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002286458286159392, + "loss": 2.944, + "step": 26500 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022837410671424816, + "loss": 2.9404, + "step": 26600 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022810238481255716, + "loss": 2.9334, + "step": 26700 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022783066291086614, + "loss": 2.9419, + "step": 26800 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022755894100917514, + "loss": 2.9432, + "step": 26900 + }, + { + "epoch": 0.24, + "learning_rate": 0.00022728993632650102, + "loss": 2.9412, + "step": 27000 + }, + { + "epoch": 0.24, + "eval_accuracy": 0.4435655243124552, + "eval_loss": 2.952514410018921, + "eval_runtime": 43.0804, + "eval_samples_per_second": 150.486, + "eval_steps_per_second": 2.53, + "step": 27000 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022701821442481, + "loss": 2.9359, + "step": 27100 + }, + { + "epoch": 0.25, + "learning_rate": 0.000226746492523119, + "loss": 2.9426, + "step": 27200 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022647477062142797, + "loss": 2.9307, + "step": 27300 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022620304871973695, + "loss": 2.9353, + "step": 27400 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022593132681804595, + "loss": 2.9353, + "step": 27500 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002256596049163549, + "loss": 2.9403, + "step": 27600 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002253878830146639, + "loss": 2.9393, + "step": 27700 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002251161611129729, + "loss": 2.9313, + "step": 27800 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022484443921128188, + "loss": 2.9348, + "step": 27900 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022457543452860775, + "loss": 2.9299, + "step": 28000 + }, + { + "epoch": 0.25, + "eval_accuracy": 0.4434947509776701, + "eval_loss": 2.9504144191741943, + "eval_runtime": 43.7459, + "eval_samples_per_second": 148.197, + "eval_steps_per_second": 2.492, + "step": 28000 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022430371262691676, + "loss": 2.938, + "step": 28100 + }, + { + "epoch": 0.25, + "learning_rate": 0.00022403199072522576, + "loss": 2.9353, + "step": 28200 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022376298604255164, + "loss": 2.9329, + "step": 28300 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002234912641408606, + "loss": 2.9311, + "step": 28400 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002232195422391696, + "loss": 2.9377, + "step": 28500 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022294782033747856, + "loss": 2.9303, + "step": 28600 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022267609843578756, + "loss": 2.9278, + "step": 28700 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022240437653409657, + "loss": 2.9394, + "step": 28800 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022213265463240551, + "loss": 2.9332, + "step": 28900 + }, + { + "epoch": 0.26, + "learning_rate": 0.00022186093273071452, + "loss": 2.9332, + "step": 29000 + }, + { + "epoch": 0.26, + "eval_accuracy": 0.4434814431711293, + "eval_loss": 2.9485716819763184, + "eval_runtime": 41.8653, + "eval_samples_per_second": 154.854, + "eval_steps_per_second": 2.604, + "step": 29000 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002215892108290235, + "loss": 2.9339, + "step": 29100 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002213174889273325, + "loss": 2.9322, + "step": 29200 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002210457670256415, + "loss": 2.9305, + "step": 29300 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022077404512395044, + "loss": 2.9321, + "step": 29400 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022050232322225945, + "loss": 2.9265, + "step": 29500 + }, + { + "epoch": 0.27, + "learning_rate": 0.00022023331853958532, + "loss": 2.9247, + "step": 29600 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021996159663789433, + "loss": 2.9312, + "step": 29700 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002196898747362033, + "loss": 2.9288, + "step": 29800 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002194181528345123, + "loss": 2.9328, + "step": 29900 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021914643093282125, + "loss": 2.9255, + "step": 30000 + }, + { + "epoch": 0.27, + "eval_accuracy": 0.444235753841873, + "eval_loss": 2.942479372024536, + "eval_runtime": 41.7184, + "eval_samples_per_second": 155.399, + "eval_steps_per_second": 2.613, + "step": 30000 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021887470903113025, + "loss": 2.9265, + "step": 30100 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021860298712943923, + "loss": 2.9184, + "step": 30200 + }, + { + "epoch": 0.27, + "learning_rate": 0.00021833126522774823, + "loss": 2.9271, + "step": 30300 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002180622605450741, + "loss": 2.9232, + "step": 30400 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002177905386433831, + "loss": 2.9303, + "step": 30500 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021751881674169206, + "loss": 2.9348, + "step": 30600 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021724709484000106, + "loss": 2.9218, + "step": 30700 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021697537293831006, + "loss": 2.9324, + "step": 30800 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021670365103661904, + "loss": 2.9294, + "step": 30900 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021643192913492801, + "loss": 2.9242, + "step": 31000 + }, + { + "epoch": 0.28, + "eval_accuracy": 0.44344756875448005, + "eval_loss": 2.945934534072876, + "eval_runtime": 43.5276, + "eval_samples_per_second": 148.94, + "eval_steps_per_second": 2.504, + "step": 31000 + }, + { + "epoch": 0.28, + "learning_rate": 0.000216160207233237, + "loss": 2.9231, + "step": 31100 + }, + { + "epoch": 0.28, + "learning_rate": 0.000215888485331546, + "loss": 2.9269, + "step": 31200 + }, + { + "epoch": 0.28, + "learning_rate": 0.000215616763429855, + "loss": 2.9247, + "step": 31300 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021534504152816394, + "loss": 2.9236, + "step": 31400 + }, + { + "epoch": 0.28, + "learning_rate": 0.00021507331962647295, + "loss": 2.9296, + "step": 31500 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021480159772478192, + "loss": 2.9267, + "step": 31600 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021452987582309092, + "loss": 2.9259, + "step": 31700 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021425815392139987, + "loss": 2.9259, + "step": 31800 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021398643201970887, + "loss": 2.9236, + "step": 31900 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021371471011801788, + "loss": 2.9242, + "step": 32000 + }, + { + "epoch": 0.29, + "eval_accuracy": 0.4445194520813107, + "eval_loss": 2.9377670288085938, + "eval_runtime": 43.8729, + "eval_samples_per_second": 147.768, + "eval_steps_per_second": 2.484, + "step": 32000 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021344298821632685, + "loss": 2.9178, + "step": 32100 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021317126631463583, + "loss": 2.9257, + "step": 32200 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002128995444129448, + "loss": 2.9227, + "step": 32300 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002126278225112538, + "loss": 2.9228, + "step": 32400 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002123561006095628, + "loss": 2.9183, + "step": 32500 + }, + { + "epoch": 0.29, + "learning_rate": 0.00021208437870787175, + "loss": 2.9196, + "step": 32600 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021181265680618076, + "loss": 2.9143, + "step": 32700 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021154093490448973, + "loss": 2.9192, + "step": 32800 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002112692130027987, + "loss": 2.9187, + "step": 32900 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021099749110110768, + "loss": 2.9267, + "step": 33000 + }, + { + "epoch": 0.3, + "eval_accuracy": 0.4452544059425405, + "eval_loss": 2.9316306114196777, + "eval_runtime": 45.514, + "eval_samples_per_second": 142.44, + "eval_steps_per_second": 2.395, + "step": 33000 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021072576919941669, + "loss": 2.9169, + "step": 33100 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002104540472977257, + "loss": 2.9219, + "step": 33200 + }, + { + "epoch": 0.3, + "learning_rate": 0.00021018232539603464, + "loss": 2.9096, + "step": 33300 + }, + { + "epoch": 0.3, + "learning_rate": 0.00020991060349434364, + "loss": 2.9202, + "step": 33400 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002096388815926526, + "loss": 2.9241, + "step": 33500 + }, + { + "epoch": 0.3, + "learning_rate": 0.00020936987690997852, + "loss": 2.9148, + "step": 33600 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002090981550082875, + "loss": 2.9194, + "step": 33700 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002088264331065965, + "loss": 2.9267, + "step": 33800 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020855471120490544, + "loss": 2.9164, + "step": 33900 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020828298930321445, + "loss": 2.9151, + "step": 34000 + }, + { + "epoch": 0.31, + "eval_accuracy": 0.44544252993500344, + "eval_loss": 2.931532382965088, + "eval_runtime": 43.496, + "eval_samples_per_second": 149.048, + "eval_steps_per_second": 2.506, + "step": 34000 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020801126740152345, + "loss": 2.9178, + "step": 34100 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020773954549983242, + "loss": 2.9119, + "step": 34200 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020746782359814143, + "loss": 2.9143, + "step": 34300 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020719610169645037, + "loss": 2.9084, + "step": 34400 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020692437979475938, + "loss": 2.9227, + "step": 34500 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020665265789306835, + "loss": 2.9159, + "step": 34600 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020638365321039425, + "loss": 2.9151, + "step": 34700 + }, + { + "epoch": 0.31, + "learning_rate": 0.00020611193130870323, + "loss": 2.9218, + "step": 34800 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020584020940701223, + "loss": 2.9169, + "step": 34900 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020556848750532118, + "loss": 2.9105, + "step": 35000 + }, + { + "epoch": 0.32, + "eval_accuracy": 0.4455647197950598, + "eval_loss": 2.928622245788574, + "eval_runtime": 45.1155, + "eval_samples_per_second": 143.698, + "eval_steps_per_second": 2.416, + "step": 35000 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020529676560363018, + "loss": 2.9135, + "step": 35100 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020502504370193919, + "loss": 2.9099, + "step": 35200 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020475332180024816, + "loss": 2.9114, + "step": 35300 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020448159989855714, + "loss": 2.9169, + "step": 35400 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002042098779968661, + "loss": 2.9098, + "step": 35500 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020393815609517511, + "loss": 2.9126, + "step": 35600 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020366643419348412, + "loss": 2.9095, + "step": 35700 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020339471229179306, + "loss": 2.9086, + "step": 35800 + }, + { + "epoch": 0.32, + "learning_rate": 0.00020312299039010207, + "loss": 2.9077, + "step": 35900 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020285126848841104, + "loss": 2.9053, + "step": 36000 + }, + { + "epoch": 0.33, + "eval_accuracy": 0.4457353016789008, + "eval_loss": 2.924194097518921, + "eval_runtime": 41.9708, + "eval_samples_per_second": 154.464, + "eval_steps_per_second": 2.597, + "step": 36000 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020257954658672004, + "loss": 2.9099, + "step": 36100 + }, + { + "epoch": 0.33, + "learning_rate": 0.000202307824685029, + "loss": 2.9118, + "step": 36200 + }, + { + "epoch": 0.33, + "learning_rate": 0.000202036102783338, + "loss": 2.91, + "step": 36300 + }, + { + "epoch": 0.33, + "learning_rate": 0.000201764380881647, + "loss": 2.8983, + "step": 36400 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020149265897995595, + "loss": 2.8964, + "step": 36500 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020122093707826495, + "loss": 2.9024, + "step": 36600 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020095193239559085, + "loss": 2.9057, + "step": 36700 + }, + { + "epoch": 0.33, + "learning_rate": 0.00020068021049389983, + "loss": 2.9094, + "step": 36800 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002004084885922088, + "loss": 2.9071, + "step": 36900 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002001367666905178, + "loss": 2.9023, + "step": 37000 + }, + { + "epoch": 0.33, + "eval_accuracy": 0.44664325702516083, + "eval_loss": 2.9194602966308594, + "eval_runtime": 42.9573, + "eval_samples_per_second": 150.917, + "eval_steps_per_second": 2.537, + "step": 37000 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019986504478882678, + "loss": 2.9047, + "step": 37100 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019959332288713575, + "loss": 2.9097, + "step": 37200 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019932160098544476, + "loss": 2.908, + "step": 37300 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019905259630277066, + "loss": 2.9019, + "step": 37400 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001987808744010796, + "loss": 2.9105, + "step": 37500 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001985091524993886, + "loss": 2.9064, + "step": 37600 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001982401478167145, + "loss": 2.9053, + "step": 37700 + }, + { + "epoch": 0.34, + "learning_rate": 0.0001979684259150235, + "loss": 2.906, + "step": 37800 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019769670401333247, + "loss": 2.8997, + "step": 37900 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019742498211164147, + "loss": 2.8946, + "step": 38000 + }, + { + "epoch": 0.34, + "eval_accuracy": 0.4468059752051368, + "eval_loss": 2.917731285095215, + "eval_runtime": 43.2928, + "eval_samples_per_second": 149.748, + "eval_steps_per_second": 2.518, + "step": 38000 + }, + { + "epoch": 0.34, + "learning_rate": 0.00019715326020995042, + "loss": 2.9018, + "step": 38100 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019688153830825942, + "loss": 2.8969, + "step": 38200 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019660981640656842, + "loss": 2.9104, + "step": 38300 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001963380945048774, + "loss": 2.9057, + "step": 38400 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019606637260318637, + "loss": 2.9094, + "step": 38500 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019579465070149535, + "loss": 2.9008, + "step": 38600 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019552292879980435, + "loss": 2.8998, + "step": 38700 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019525120689811335, + "loss": 2.9019, + "step": 38800 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001949794849964223, + "loss": 2.8925, + "step": 38900 + }, + { + "epoch": 0.35, + "learning_rate": 0.0001947077630947313, + "loss": 2.9037, + "step": 39000 + }, + { + "epoch": 0.35, + "eval_accuracy": 0.44703039321543825, + "eval_loss": 2.9147427082061768, + "eval_runtime": 43.7223, + "eval_samples_per_second": 148.277, + "eval_steps_per_second": 2.493, + "step": 39000 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019443604119304028, + "loss": 2.9052, + "step": 39100 + }, + { + "epoch": 0.35, + "learning_rate": 0.00019416431929134928, + "loss": 2.9038, + "step": 39200 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019389259738965823, + "loss": 2.9046, + "step": 39300 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019362087548796723, + "loss": 2.903, + "step": 39400 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019334915358627623, + "loss": 2.8919, + "step": 39500 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019307743168458518, + "loss": 2.8936, + "step": 39600 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019280570978289418, + "loss": 2.8985, + "step": 39700 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019253398788120316, + "loss": 2.8955, + "step": 39800 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019226226597951216, + "loss": 2.8943, + "step": 39900 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019199326129683804, + "loss": 2.8893, + "step": 40000 + }, + { + "epoch": 0.36, + "eval_accuracy": 0.44681383890900184, + "eval_loss": 2.9129724502563477, + "eval_runtime": 42.9613, + "eval_samples_per_second": 150.903, + "eval_steps_per_second": 2.537, + "step": 40000 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019172153939514704, + "loss": 2.8923, + "step": 40100 + }, + { + "epoch": 0.36, + "learning_rate": 0.00019144981749345602, + "loss": 2.8998, + "step": 40200 + }, + { + "epoch": 0.36, + "learning_rate": 0.000191178095591765, + "loss": 2.8931, + "step": 40300 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019090637369007397, + "loss": 2.8965, + "step": 40400 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019063465178838297, + "loss": 2.8992, + "step": 40500 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019036292988669197, + "loss": 2.8974, + "step": 40600 + }, + { + "epoch": 0.37, + "learning_rate": 0.00019009120798500092, + "loss": 2.8929, + "step": 40700 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018981948608330992, + "loss": 2.8919, + "step": 40800 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001895477641816189, + "loss": 2.8907, + "step": 40900 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001892760422799279, + "loss": 2.8891, + "step": 41000 + }, + { + "epoch": 0.37, + "eval_accuracy": 0.4481204235511882, + "eval_loss": 2.9055044651031494, + "eval_runtime": 43.4382, + "eval_samples_per_second": 149.246, + "eval_steps_per_second": 2.509, + "step": 41000 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018900432037823687, + "loss": 2.8892, + "step": 41100 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018873259847654585, + "loss": 2.8979, + "step": 41200 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018846087657485485, + "loss": 2.8864, + "step": 41300 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001881891546731638, + "loss": 2.8905, + "step": 41400 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001879174327714728, + "loss": 2.8849, + "step": 41500 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001876457108697818, + "loss": 2.8959, + "step": 41600 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018737398896809078, + "loss": 2.8923, + "step": 41700 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018710226706639978, + "loss": 2.8878, + "step": 41800 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018683326238372566, + "loss": 2.8848, + "step": 41900 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018656154048203463, + "loss": 2.8851, + "step": 42000 + }, + { + "epoch": 0.38, + "eval_accuracy": 0.4484996960376006, + "eval_loss": 2.90169358253479, + "eval_runtime": 44.5924, + "eval_samples_per_second": 145.384, + "eval_steps_per_second": 2.444, + "step": 42000 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001862898185803436, + "loss": 2.8892, + "step": 42100 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001860180966786526, + "loss": 2.8835, + "step": 42200 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001857463747769616, + "loss": 2.8868, + "step": 42300 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001854746528752706, + "loss": 2.89, + "step": 42400 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018520293097357954, + "loss": 2.8903, + "step": 42500 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018493120907188854, + "loss": 2.8868, + "step": 42600 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018466220438921442, + "loss": 2.8882, + "step": 42700 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018439048248752342, + "loss": 2.8788, + "step": 42800 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001841187605858324, + "loss": 2.8884, + "step": 42900 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001838470386841414, + "loss": 2.8909, + "step": 43000 + }, + { + "epoch": 0.39, + "eval_accuracy": 0.44834423666119233, + "eval_loss": 2.9010777473449707, + "eval_runtime": 43.3319, + "eval_samples_per_second": 149.613, + "eval_steps_per_second": 2.515, + "step": 43000 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001835753167824504, + "loss": 2.8868, + "step": 43100 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018330359488075935, + "loss": 2.8935, + "step": 43200 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018303187297906835, + "loss": 2.883, + "step": 43300 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018276015107737733, + "loss": 2.8895, + "step": 43400 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001824911463947032, + "loss": 2.8958, + "step": 43500 + }, + { + "epoch": 0.39, + "learning_rate": 0.0001822194244930122, + "loss": 2.8916, + "step": 43600 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001819477025913212, + "loss": 2.8949, + "step": 43700 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018167869790864708, + "loss": 2.8898, + "step": 43800 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018140697600695606, + "loss": 2.8887, + "step": 43900 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018113525410526506, + "loss": 2.896, + "step": 44000 + }, + { + "epoch": 0.4, + "eval_accuracy": 0.4478663654263186, + "eval_loss": 2.9061102867126465, + "eval_runtime": 43.1173, + "eval_samples_per_second": 150.357, + "eval_steps_per_second": 2.528, + "step": 44000 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018086353220357404, + "loss": 2.8965, + "step": 44100 + }, + { + "epoch": 0.4, + "learning_rate": 0.000180591810301883, + "loss": 2.8969, + "step": 44200 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018032008840019201, + "loss": 2.8913, + "step": 44300 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018004836649850096, + "loss": 2.8897, + "step": 44400 + }, + { + "epoch": 0.4, + "learning_rate": 0.00017977664459680996, + "loss": 2.8952, + "step": 44500 + }, + { + "epoch": 0.4, + "learning_rate": 0.00017950492269511897, + "loss": 2.9008, + "step": 44600 + }, + { + "epoch": 0.4, + "learning_rate": 0.00017923320079342794, + "loss": 2.8884, + "step": 44700 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017896147889173694, + "loss": 2.8971, + "step": 44800 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001786897569900459, + "loss": 2.8824, + "step": 44900 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001784180350883549, + "loss": 2.8918, + "step": 45000 + }, + { + "epoch": 0.41, + "eval_accuracy": 0.44788874673731904, + "eval_loss": 2.90425443649292, + "eval_runtime": 45.928, + "eval_samples_per_second": 141.156, + "eval_steps_per_second": 2.373, + "step": 45000 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017814631318666387, + "loss": 2.886, + "step": 45100 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017787459128497285, + "loss": 2.8935, + "step": 45200 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017760286938328185, + "loss": 2.8851, + "step": 45300 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017733114748159082, + "loss": 2.8869, + "step": 45400 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017705942557989983, + "loss": 2.8816, + "step": 45500 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017678770367820877, + "loss": 2.8726, + "step": 45600 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017651598177651778, + "loss": 2.8815, + "step": 45700 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017624425987482678, + "loss": 2.8835, + "step": 45800 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017597253797313575, + "loss": 2.8814, + "step": 45900 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017570081607144473, + "loss": 2.8847, + "step": 46000 + }, + { + "epoch": 0.42, + "eval_accuracy": 0.4490059975864478, + "eval_loss": 2.89544415473938, + "eval_runtime": 42.9804, + "eval_samples_per_second": 150.836, + "eval_steps_per_second": 2.536, + "step": 46000 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001754290941697537, + "loss": 2.8699, + "step": 46100 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001751573722680627, + "loss": 2.8829, + "step": 46200 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017488565036637165, + "loss": 2.8773, + "step": 46300 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017461392846468066, + "loss": 2.8812, + "step": 46400 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017434220656298966, + "loss": 2.8805, + "step": 46500 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017407048466129863, + "loss": 2.8812, + "step": 46600 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017379876275960764, + "loss": 2.8826, + "step": 46700 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017352704085791659, + "loss": 2.8801, + "step": 46800 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017325803617524252, + "loss": 2.8787, + "step": 46900 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017298631427355146, + "loss": 2.8749, + "step": 47000 + }, + { + "epoch": 0.42, + "eval_accuracy": 0.44940160238088755, + "eval_loss": 2.8912456035614014, + "eval_runtime": 43.8328, + "eval_samples_per_second": 147.903, + "eval_steps_per_second": 2.487, + "step": 47000 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017271730959087737, + "loss": 2.8715, + "step": 47100 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017244558768918637, + "loss": 2.8804, + "step": 47200 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017217386578749535, + "loss": 2.8802, + "step": 47300 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017190214388580432, + "loss": 2.8779, + "step": 47400 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017163042198411332, + "loss": 2.878, + "step": 47500 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017135870008242227, + "loss": 2.8835, + "step": 47600 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017108697818073127, + "loss": 2.8758, + "step": 47700 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017081525627904025, + "loss": 2.8751, + "step": 47800 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017054353437734925, + "loss": 2.8737, + "step": 47900 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017027181247565825, + "loss": 2.8832, + "step": 48000 + }, + { + "epoch": 0.43, + "eval_accuracy": 0.4496018243792967, + "eval_loss": 2.891221761703491, + "eval_runtime": 43.1479, + "eval_samples_per_second": 150.251, + "eval_steps_per_second": 2.526, + "step": 48000 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001700000905739672, + "loss": 2.8757, + "step": 48100 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001697283686722762, + "loss": 2.8725, + "step": 48200 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016945664677058518, + "loss": 2.8749, + "step": 48300 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016918492486889416, + "loss": 2.8747, + "step": 48400 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016891320296720316, + "loss": 2.8724, + "step": 48500 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016864148106551213, + "loss": 2.8717, + "step": 48600 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016836975916382114, + "loss": 2.8653, + "step": 48700 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016809803726213008, + "loss": 2.869, + "step": 48800 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016782631536043909, + "loss": 2.8763, + "step": 48900 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001675545934587481, + "loss": 2.8745, + "step": 49000 + }, + { + "epoch": 0.44, + "eval_accuracy": 0.45002646438800725, + "eval_loss": 2.8852970600128174, + "eval_runtime": 43.6365, + "eval_samples_per_second": 148.568, + "eval_steps_per_second": 2.498, + "step": 49000 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016728287155705706, + "loss": 2.8753, + "step": 49100 + }, + { + "epoch": 0.44, + "learning_rate": 0.00016701114965536604, + "loss": 2.8684, + "step": 49200 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016673942775367501, + "loss": 2.8711, + "step": 49300 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016646770585198402, + "loss": 2.8646, + "step": 49400 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016619598395029296, + "loss": 2.865, + "step": 49500 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001659269792676189, + "loss": 2.8773, + "step": 49600 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016565525736592787, + "loss": 2.8703, + "step": 49700 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016538353546423687, + "loss": 2.8722, + "step": 49800 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016511181356254582, + "loss": 2.8713, + "step": 49900 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016484009166085482, + "loss": 2.8717, + "step": 50000 + }, + { + "epoch": 0.45, + "eval_accuracy": 0.45021942758284866, + "eval_loss": 2.8834283351898193, + "eval_runtime": 43.5477, + "eval_samples_per_second": 148.871, + "eval_steps_per_second": 2.503, + "step": 50000 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016456836975916383, + "loss": 2.8727, + "step": 50100 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016429664785747277, + "loss": 2.8622, + "step": 50200 + }, + { + "epoch": 0.45, + "learning_rate": 0.00016402492595578178, + "loss": 2.8707, + "step": 50300 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016375320405409075, + "loss": 2.8645, + "step": 50400 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016348148215239975, + "loss": 2.8642, + "step": 50500 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016321247746972563, + "loss": 2.8679, + "step": 50600 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016294075556803463, + "loss": 2.871, + "step": 50700 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001626690336663436, + "loss": 2.867, + "step": 50800 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016239731176465258, + "loss": 2.8643, + "step": 50900 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016212558986296156, + "loss": 2.8659, + "step": 51000 + }, + { + "epoch": 0.46, + "eval_accuracy": 0.45029624992060685, + "eval_loss": 2.883072853088379, + "eval_runtime": 43.5545, + "eval_samples_per_second": 148.848, + "eval_steps_per_second": 2.503, + "step": 51000 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016185386796127056, + "loss": 2.8694, + "step": 51100 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016158214605957956, + "loss": 2.8671, + "step": 51200 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001613104241578885, + "loss": 2.8624, + "step": 51300 + }, + { + "epoch": 0.46, + "learning_rate": 0.00016103870225619751, + "loss": 2.8665, + "step": 51400 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001607669803545065, + "loss": 2.8613, + "step": 51500 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016049525845281547, + "loss": 2.8637, + "step": 51600 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016022353655112447, + "loss": 2.8662, + "step": 51700 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015995181464943344, + "loss": 2.8652, + "step": 51800 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015968009274774245, + "loss": 2.8673, + "step": 51900 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001594083708460514, + "loss": 2.865, + "step": 52000 + }, + { + "epoch": 0.47, + "eval_accuracy": 0.450486793514259, + "eval_loss": 2.878352403640747, + "eval_runtime": 43.3417, + "eval_samples_per_second": 149.579, + "eval_steps_per_second": 2.515, + "step": 52000 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001591366489443604, + "loss": 2.8688, + "step": 52100 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015886492704266937, + "loss": 2.862, + "step": 52200 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015859320514097837, + "loss": 2.8646, + "step": 52300 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015832148323928735, + "loss": 2.8672, + "step": 52400 + }, + { + "epoch": 0.47, + "learning_rate": 0.00015804976133759632, + "loss": 2.8594, + "step": 52500 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015777803943590533, + "loss": 2.8558, + "step": 52600 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001575063175342143, + "loss": 2.8576, + "step": 52700 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001572373128515402, + "loss": 2.8597, + "step": 52800 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015696559094984918, + "loss": 2.8615, + "step": 52900 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015669386904815818, + "loss": 2.8575, + "step": 53000 + }, + { + "epoch": 0.48, + "eval_accuracy": 0.45082372297985984, + "eval_loss": 2.8763039112091064, + "eval_runtime": 43.6525, + "eval_samples_per_second": 148.514, + "eval_steps_per_second": 2.497, + "step": 53000 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015642214714646713, + "loss": 2.8673, + "step": 53100 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015615042524477613, + "loss": 2.854, + "step": 53200 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015587870334308514, + "loss": 2.8652, + "step": 53300 + }, + { + "epoch": 0.48, + "learning_rate": 0.000155609698660411, + "loss": 2.8596, + "step": 53400 + }, + { + "epoch": 0.48, + "learning_rate": 0.00015533797675872, + "loss": 2.8641, + "step": 53500 + }, + { + "epoch": 0.48, + "learning_rate": 0.000155066254857029, + "loss": 2.8595, + "step": 53600 + }, + { + "epoch": 0.49, + "learning_rate": 0.000154794532955338, + "loss": 2.8562, + "step": 53700 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015452281105364694, + "loss": 2.8529, + "step": 53800 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015425108915195594, + "loss": 2.8629, + "step": 53900 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015397936725026492, + "loss": 2.8571, + "step": 54000 + }, + { + "epoch": 0.49, + "eval_accuracy": 0.4512689295986789, + "eval_loss": 2.874122142791748, + "eval_runtime": 43.0942, + "eval_samples_per_second": 150.438, + "eval_steps_per_second": 2.529, + "step": 54000 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001537076453485739, + "loss": 2.8605, + "step": 54100 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015343592344688287, + "loss": 2.8668, + "step": 54200 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015316420154519187, + "loss": 2.8604, + "step": 54300 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015289247964350087, + "loss": 2.857, + "step": 54400 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015262075774180982, + "loss": 2.8599, + "step": 54500 + }, + { + "epoch": 0.49, + "learning_rate": 0.00015234903584011882, + "loss": 2.8653, + "step": 54600 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001520773139384278, + "loss": 2.857, + "step": 54700 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001518055920367368, + "loss": 2.8543, + "step": 54800 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015153658735406268, + "loss": 2.8495, + "step": 54900 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015126486545237168, + "loss": 2.8554, + "step": 55000 + }, + { + "epoch": 0.5, + "eval_accuracy": 0.4514479800866822, + "eval_loss": 2.870398998260498, + "eval_runtime": 43.838, + "eval_samples_per_second": 147.885, + "eval_steps_per_second": 2.486, + "step": 55000 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015099314355068063, + "loss": 2.8595, + "step": 55100 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015072142164898963, + "loss": 2.855, + "step": 55200 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001504496997472986, + "loss": 2.8663, + "step": 55300 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001501779778456076, + "loss": 2.8555, + "step": 55400 + }, + { + "epoch": 0.5, + "learning_rate": 0.00014990625594391658, + "loss": 2.8596, + "step": 55500 + }, + { + "epoch": 0.5, + "learning_rate": 0.00014963453404222556, + "loss": 2.8589, + "step": 55600 + }, + { + "epoch": 0.5, + "learning_rate": 0.00014936281214053456, + "loss": 2.8568, + "step": 55700 + }, + { + "epoch": 0.5, + "learning_rate": 0.00014909109023884354, + "loss": 2.8474, + "step": 55800 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001488193683371525, + "loss": 2.8515, + "step": 55900 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014854764643546151, + "loss": 2.8526, + "step": 56000 + }, + { + "epoch": 0.51, + "eval_accuracy": 0.45189379160579857, + "eval_loss": 2.86692214012146, + "eval_runtime": 43.3506, + "eval_samples_per_second": 149.548, + "eval_steps_per_second": 2.514, + "step": 56000 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001482759245337705, + "loss": 2.8504, + "step": 56100 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001480042026320795, + "loss": 2.854, + "step": 56200 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014773248073038847, + "loss": 2.8512, + "step": 56300 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014746075882869744, + "loss": 2.8515, + "step": 56400 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014718903692700642, + "loss": 2.8492, + "step": 56500 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014691731502531542, + "loss": 2.8491, + "step": 56600 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001466455931236244, + "loss": 2.8466, + "step": 56700 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001463738712219334, + "loss": 2.8508, + "step": 56800 + }, + { + "epoch": 0.51, + "learning_rate": 0.00014610214932024237, + "loss": 2.8567, + "step": 56900 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014583042741855135, + "loss": 2.8521, + "step": 57000 + }, + { + "epoch": 0.52, + "eval_accuracy": 0.45249203799983667, + "eval_loss": 2.861818552017212, + "eval_runtime": 43.168, + "eval_samples_per_second": 150.181, + "eval_steps_per_second": 2.525, + "step": 57000 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014555870551686032, + "loss": 2.8463, + "step": 57100 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001452869836151693, + "loss": 2.8433, + "step": 57200 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001450152617134783, + "loss": 2.8446, + "step": 57300 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014474353981178728, + "loss": 2.8477, + "step": 57400 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014447181791009628, + "loss": 2.8439, + "step": 57500 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014420009600840525, + "loss": 2.8459, + "step": 57600 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014392837410671423, + "loss": 2.8445, + "step": 57700 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001436566522050232, + "loss": 2.8455, + "step": 57800 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001433876475223491, + "loss": 2.8474, + "step": 57900 + }, + { + "epoch": 0.52, + "learning_rate": 0.000143118642839675, + "loss": 2.8398, + "step": 58000 + }, + { + "epoch": 0.52, + "eval_accuracy": 0.45218656334969587, + "eval_loss": 2.8599517345428467, + "eval_runtime": 43.8444, + "eval_samples_per_second": 147.864, + "eval_steps_per_second": 2.486, + "step": 58000 + }, + { + "epoch": 0.53, + "learning_rate": 0.000142846920937984, + "loss": 2.8492, + "step": 58100 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014257519903629296, + "loss": 2.8434, + "step": 58200 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014230347713460197, + "loss": 2.8483, + "step": 58300 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014203175523291094, + "loss": 2.8441, + "step": 58400 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014176003333121992, + "loss": 2.8474, + "step": 58500 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014148831142952892, + "loss": 2.8385, + "step": 58600 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001412165895278379, + "loss": 2.8424, + "step": 58700 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014094486762614687, + "loss": 2.847, + "step": 58800 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014067314572445587, + "loss": 2.8511, + "step": 58900 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014040142382276485, + "loss": 2.8398, + "step": 59000 + }, + { + "epoch": 0.53, + "eval_accuracy": 0.45275395982857125, + "eval_loss": 2.8576090335845947, + "eval_runtime": 43.2028, + "eval_samples_per_second": 150.06, + "eval_steps_per_second": 2.523, + "step": 59000 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014012970192107382, + "loss": 2.8386, + "step": 59100 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013985798001938282, + "loss": 2.8458, + "step": 59200 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001395862581176918, + "loss": 2.8356, + "step": 59300 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013931453621600078, + "loss": 2.8379, + "step": 59400 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013904281431430978, + "loss": 2.8325, + "step": 59500 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013877109241261875, + "loss": 2.8461, + "step": 59600 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013849937051092773, + "loss": 2.8521, + "step": 59700 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013823036582825363, + "loss": 2.8273, + "step": 59800 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013795864392656263, + "loss": 2.8318, + "step": 59900 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001376869220248716, + "loss": 2.837, + "step": 60000 + }, + { + "epoch": 0.54, + "eval_accuracy": 0.4528289674654375, + "eval_loss": 2.8535568714141846, + "eval_runtime": 43.1874, + "eval_samples_per_second": 150.113, + "eval_steps_per_second": 2.524, + "step": 60000 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013741520012318058, + "loss": 2.8396, + "step": 60100 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013714347822148956, + "loss": 2.8395, + "step": 60200 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013687447353881546, + "loss": 2.8325, + "step": 60300 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013660275163712444, + "loss": 2.8412, + "step": 60400 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013633102973543344, + "loss": 2.8392, + "step": 60500 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013605930783374242, + "loss": 2.843, + "step": 60600 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001357875859320514, + "loss": 2.8337, + "step": 60700 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013551586403036037, + "loss": 2.8452, + "step": 60800 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013524414212866937, + "loss": 2.8448, + "step": 60900 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013497242022697835, + "loss": 2.837, + "step": 61000 + }, + { + "epoch": 0.55, + "eval_accuracy": 0.4534701617805845, + "eval_loss": 2.851900577545166, + "eval_runtime": 43.1282, + "eval_samples_per_second": 150.319, + "eval_steps_per_second": 2.527, + "step": 61000 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013470069832528735, + "loss": 2.8331, + "step": 61100 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013442897642359632, + "loss": 2.832, + "step": 61200 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001341572545219053, + "loss": 2.8255, + "step": 61300 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013388553262021427, + "loss": 2.8327, + "step": 61400 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013361381071852328, + "loss": 2.8386, + "step": 61500 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013334208881683225, + "loss": 2.8315, + "step": 61600 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013307036691514125, + "loss": 2.824, + "step": 61700 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013279864501345023, + "loss": 2.8296, + "step": 61800 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001325269231117592, + "loss": 2.8378, + "step": 61900 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001322579184290851, + "loss": 2.8427, + "step": 62000 + }, + { + "epoch": 0.56, + "eval_accuracy": 0.4535663409278566, + "eval_loss": 2.8492891788482666, + "eval_runtime": 43.4858, + "eval_samples_per_second": 149.083, + "eval_steps_per_second": 2.507, + "step": 62000 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013198619652739408, + "loss": 2.8329, + "step": 62100 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013171447462570306, + "loss": 2.8389, + "step": 62200 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013144275272401206, + "loss": 2.8358, + "step": 62300 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013117103082232104, + "loss": 2.8369, + "step": 62400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013089930892063, + "loss": 2.8294, + "step": 62500 + }, + { + "epoch": 0.57, + "learning_rate": 0.000130627587018939, + "loss": 2.834, + "step": 62600 + }, + { + "epoch": 0.57, + "learning_rate": 0.000130355865117248, + "loss": 2.8414, + "step": 62700 + }, + { + "epoch": 0.57, + "learning_rate": 0.00013008414321555696, + "loss": 2.8384, + "step": 62800 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012981242131386597, + "loss": 2.8384, + "step": 62900 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012954069941217494, + "loss": 2.8365, + "step": 63000 + }, + { + "epoch": 0.57, + "eval_accuracy": 0.45409986299008265, + "eval_loss": 2.8467965126037598, + "eval_runtime": 47.1796, + "eval_samples_per_second": 137.411, + "eval_steps_per_second": 2.31, + "step": 63000 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012926897751048392, + "loss": 2.8281, + "step": 63100 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012899725560879292, + "loss": 2.8197, + "step": 63200 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001287255337071019, + "loss": 2.8233, + "step": 63300 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012845652902442777, + "loss": 2.828, + "step": 63400 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012818480712273677, + "loss": 2.8334, + "step": 63500 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012791308522104578, + "loss": 2.8332, + "step": 63600 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012764136331935475, + "loss": 2.8279, + "step": 63700 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012736964141766373, + "loss": 2.8271, + "step": 63800 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001270979195159727, + "loss": 2.8306, + "step": 63900 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012682619761428168, + "loss": 2.8327, + "step": 64000 + }, + { + "epoch": 0.58, + "eval_accuracy": 0.4538736302788893, + "eval_loss": 2.8447225093841553, + "eval_runtime": 44.4204, + "eval_samples_per_second": 145.946, + "eval_steps_per_second": 2.454, + "step": 64000 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012655447571259068, + "loss": 2.836, + "step": 64100 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012628275381089965, + "loss": 2.8337, + "step": 64200 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012601103190920866, + "loss": 2.8333, + "step": 64300 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012573931000751763, + "loss": 2.8298, + "step": 64400 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001254675881058266, + "loss": 2.8285, + "step": 64500 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012519586620413558, + "loss": 2.8252, + "step": 64600 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012492414430244459, + "loss": 2.8227, + "step": 64700 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012465242240075356, + "loss": 2.8286, + "step": 64800 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012438070049906256, + "loss": 2.8218, + "step": 64900 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012410897859737154, + "loss": 2.8289, + "step": 65000 + }, + { + "epoch": 0.59, + "eval_accuracy": 0.4545583774154425, + "eval_loss": 2.838773012161255, + "eval_runtime": 43.8892, + "eval_samples_per_second": 147.713, + "eval_steps_per_second": 2.484, + "step": 65000 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001238372566956805, + "loss": 2.8198, + "step": 65100 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001235655347939895, + "loss": 2.8207, + "step": 65200 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012329381289229846, + "loss": 2.8296, + "step": 65300 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012302209099060747, + "loss": 2.8293, + "step": 65400 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012275036908891647, + "loss": 2.8188, + "step": 65500 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012247864718722544, + "loss": 2.819, + "step": 65600 + }, + { + "epoch": 0.59, + "learning_rate": 0.00012220692528553442, + "loss": 2.8219, + "step": 65700 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001219352033838434, + "loss": 2.8199, + "step": 65800 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001216634814821524, + "loss": 2.8282, + "step": 65900 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012139175958046137, + "loss": 2.8166, + "step": 66000 + }, + { + "epoch": 0.6, + "eval_accuracy": 0.45473863770404044, + "eval_loss": 2.834634780883789, + "eval_runtime": 43.1108, + "eval_samples_per_second": 150.38, + "eval_steps_per_second": 2.528, + "step": 66000 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012112003767877036, + "loss": 2.8226, + "step": 66100 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012084831577707934, + "loss": 2.8135, + "step": 66200 + }, + { + "epoch": 0.6, + "learning_rate": 0.00012057659387538832, + "loss": 2.8134, + "step": 66300 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001203048719736973, + "loss": 2.8214, + "step": 66400 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001200358672910232, + "loss": 2.8142, + "step": 66500 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011976414538933219, + "loss": 2.8196, + "step": 66600 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011949242348764117, + "loss": 2.8145, + "step": 66700 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011922070158595016, + "loss": 2.8093, + "step": 66800 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011894897968425913, + "loss": 2.8168, + "step": 66900 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011867725778256813, + "loss": 2.8171, + "step": 67000 + }, + { + "epoch": 0.61, + "eval_accuracy": 0.45580810142968187, + "eval_loss": 2.8293869495391846, + "eval_runtime": 44.4137, + "eval_samples_per_second": 145.968, + "eval_steps_per_second": 2.454, + "step": 67000 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011840553588087711, + "loss": 2.8123, + "step": 67100 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001181338139791861, + "loss": 2.8121, + "step": 67200 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011786209207749507, + "loss": 2.8083, + "step": 67300 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011759037017580405, + "loss": 2.8156, + "step": 67400 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011731864827411304, + "loss": 2.8225, + "step": 67500 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011704692637242204, + "loss": 2.8109, + "step": 67600 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011677520447073102, + "loss": 2.8137, + "step": 67700 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001165061997880569, + "loss": 2.8097, + "step": 67800 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011623447788636588, + "loss": 2.8099, + "step": 67900 + }, + { + "epoch": 0.61, + "learning_rate": 0.00011596275598467488, + "loss": 2.8184, + "step": 68000 + }, + { + "epoch": 0.61, + "eval_accuracy": 0.4556344950443543, + "eval_loss": 2.826944589614868, + "eval_runtime": 43.7297, + "eval_samples_per_second": 148.252, + "eval_steps_per_second": 2.493, + "step": 68000 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011569103408298386, + "loss": 2.8164, + "step": 68100 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011541931218129285, + "loss": 2.8137, + "step": 68200 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011514759027960182, + "loss": 2.8168, + "step": 68300 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011487858559692771, + "loss": 2.8156, + "step": 68400 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011460686369523672, + "loss": 2.8114, + "step": 68500 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011433514179354569, + "loss": 2.8066, + "step": 68600 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011406341989185468, + "loss": 2.8124, + "step": 68700 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011379169799016366, + "loss": 2.8093, + "step": 68800 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011351997608847263, + "loss": 2.8131, + "step": 68900 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011324825418678162, + "loss": 2.8102, + "step": 69000 + }, + { + "epoch": 0.62, + "eval_accuracy": 0.45632710588477254, + "eval_loss": 2.8243494033813477, + "eval_runtime": 42.7646, + "eval_samples_per_second": 151.597, + "eval_steps_per_second": 2.549, + "step": 69000 + }, + { + "epoch": 0.62, + "learning_rate": 0.00011297653228509062, + "loss": 2.8064, + "step": 69100 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001127048103833996, + "loss": 2.8075, + "step": 69200 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011243308848170857, + "loss": 2.8146, + "step": 69300 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011216136658001756, + "loss": 2.8166, + "step": 69400 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011188964467832654, + "loss": 2.8073, + "step": 69500 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011161792277663554, + "loss": 2.8116, + "step": 69600 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011134620087494451, + "loss": 2.807, + "step": 69700 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001110744789732535, + "loss": 2.8066, + "step": 69800 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011080547429057939, + "loss": 2.8101, + "step": 69900 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011053375238888837, + "loss": 2.8153, + "step": 70000 + }, + { + "epoch": 0.63, + "eval_accuracy": 0.45636279500231375, + "eval_loss": 2.821134328842163, + "eval_runtime": 42.931, + "eval_samples_per_second": 151.01, + "eval_steps_per_second": 2.539, + "step": 70000 + }, + { + "epoch": 0.63, + "learning_rate": 0.00011026203048719737, + "loss": 2.8109, + "step": 70100 + }, + { + "epoch": 0.63, + "learning_rate": 0.00010999030858550635, + "loss": 2.8025, + "step": 70200 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010971858668381533, + "loss": 2.8055, + "step": 70300 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010944686478212431, + "loss": 2.8047, + "step": 70400 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010917514288043329, + "loss": 2.8095, + "step": 70500 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010890342097874227, + "loss": 2.805, + "step": 70600 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010863169907705128, + "loss": 2.8079, + "step": 70700 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010835997717536025, + "loss": 2.8071, + "step": 70800 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010809097249268614, + "loss": 2.8016, + "step": 70900 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010781925059099512, + "loss": 2.8035, + "step": 71000 + }, + { + "epoch": 0.64, + "eval_accuracy": 0.4569090199707833, + "eval_loss": 2.8184897899627686, + "eval_runtime": 43.5955, + "eval_samples_per_second": 148.708, + "eval_steps_per_second": 2.5, + "step": 71000 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010755024590832102, + "loss": 2.8002, + "step": 71100 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010727852400663001, + "loss": 2.8186, + "step": 71200 + }, + { + "epoch": 0.64, + "learning_rate": 0.00010700680210493899, + "loss": 2.8036, + "step": 71300 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010673508020324797, + "loss": 2.8077, + "step": 71400 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010646335830155695, + "loss": 2.8111, + "step": 71500 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010619163639986595, + "loss": 2.8018, + "step": 71600 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010591991449817493, + "loss": 2.8079, + "step": 71700 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010564819259648392, + "loss": 2.8124, + "step": 71800 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010537647069479289, + "loss": 2.807, + "step": 71900 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010510474879310187, + "loss": 2.8042, + "step": 72000 + }, + { + "epoch": 0.65, + "eval_accuracy": 0.4569186983755403, + "eval_loss": 2.8206183910369873, + "eval_runtime": 44.1793, + "eval_samples_per_second": 146.743, + "eval_steps_per_second": 2.467, + "step": 72000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010483302689141086, + "loss": 2.8066, + "step": 72100 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010456130498971986, + "loss": 2.8088, + "step": 72200 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010428958308802883, + "loss": 2.8036, + "step": 72300 + }, + { + "epoch": 0.65, + "learning_rate": 0.00010401786118633781, + "loss": 2.7985, + "step": 72400 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001037461392846468, + "loss": 2.7981, + "step": 72500 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010347441738295577, + "loss": 2.7993, + "step": 72600 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010320269548126476, + "loss": 2.7999, + "step": 72700 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010293097357957375, + "loss": 2.8009, + "step": 72800 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010265925167788274, + "loss": 2.7943, + "step": 72900 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010238752977619171, + "loss": 2.7984, + "step": 73000 + }, + { + "epoch": 0.66, + "eval_accuracy": 0.457420160722009, + "eval_loss": 2.8137617111206055, + "eval_runtime": 43.507, + "eval_samples_per_second": 149.01, + "eval_steps_per_second": 2.505, + "step": 73000 + }, + { + "epoch": 0.66, + "learning_rate": 0.0001021158078745007, + "loss": 2.7913, + "step": 73100 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010184408597280968, + "loss": 2.8016, + "step": 73200 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010157236407111868, + "loss": 2.7988, + "step": 73300 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010130064216942766, + "loss": 2.792, + "step": 73400 + }, + { + "epoch": 0.66, + "learning_rate": 0.00010103163748675355, + "loss": 2.7926, + "step": 73500 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010075991558506253, + "loss": 2.7796, + "step": 73600 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010048819368337151, + "loss": 2.7971, + "step": 73700 + }, + { + "epoch": 0.67, + "learning_rate": 0.00010021647178168051, + "loss": 2.7974, + "step": 73800 + }, + { + "epoch": 0.67, + "learning_rate": 9.994474987998949e-05, + "loss": 2.7951, + "step": 73900 + }, + { + "epoch": 0.67, + "learning_rate": 9.967302797829848e-05, + "loss": 2.7883, + "step": 74000 + }, + { + "epoch": 0.67, + "eval_accuracy": 0.45740261861338705, + "eval_loss": 2.8111917972564697, + "eval_runtime": 44.0953, + "eval_samples_per_second": 147.023, + "eval_steps_per_second": 2.472, + "step": 74000 + }, + { + "epoch": 0.67, + "learning_rate": 9.940130607660745e-05, + "loss": 2.7898, + "step": 74100 + }, + { + "epoch": 0.67, + "learning_rate": 9.912958417491643e-05, + "loss": 2.7914, + "step": 74200 + }, + { + "epoch": 0.67, + "learning_rate": 9.885786227322542e-05, + "loss": 2.798, + "step": 74300 + }, + { + "epoch": 0.67, + "learning_rate": 9.85861403715344e-05, + "loss": 2.7938, + "step": 74400 + }, + { + "epoch": 0.67, + "learning_rate": 9.83144184698434e-05, + "loss": 2.7927, + "step": 74500 + }, + { + "epoch": 0.67, + "learning_rate": 9.804269656815237e-05, + "loss": 2.7967, + "step": 74600 + }, + { + "epoch": 0.68, + "learning_rate": 9.777369188547826e-05, + "loss": 2.7933, + "step": 74700 + }, + { + "epoch": 0.68, + "learning_rate": 9.750196998378726e-05, + "loss": 2.7913, + "step": 74800 + }, + { + "epoch": 0.68, + "learning_rate": 9.723024808209624e-05, + "loss": 2.7924, + "step": 74900 + }, + { + "epoch": 0.68, + "learning_rate": 9.695852618040523e-05, + "loss": 2.7962, + "step": 75000 + }, + { + "epoch": 0.68, + "eval_accuracy": 0.4583686443881887, + "eval_loss": 2.8055942058563232, + "eval_runtime": 44.8912, + "eval_samples_per_second": 144.416, + "eval_steps_per_second": 2.428, + "step": 75000 + }, + { + "epoch": 0.68, + "learning_rate": 9.66868042787142e-05, + "loss": 2.7848, + "step": 75100 + }, + { + "epoch": 0.68, + "learning_rate": 9.641779959604009e-05, + "loss": 2.7935, + "step": 75200 + }, + { + "epoch": 0.68, + "learning_rate": 9.61460776943491e-05, + "loss": 2.7961, + "step": 75300 + }, + { + "epoch": 0.68, + "learning_rate": 9.587435579265807e-05, + "loss": 2.788, + "step": 75400 + }, + { + "epoch": 0.68, + "learning_rate": 9.560263389096706e-05, + "loss": 2.7934, + "step": 75500 + }, + { + "epoch": 0.68, + "learning_rate": 9.533091198927603e-05, + "loss": 2.7888, + "step": 75600 + }, + { + "epoch": 0.68, + "learning_rate": 9.505919008758501e-05, + "loss": 2.7954, + "step": 75700 + }, + { + "epoch": 0.69, + "learning_rate": 9.4787468185894e-05, + "loss": 2.7934, + "step": 75800 + }, + { + "epoch": 0.69, + "learning_rate": 9.451574628420299e-05, + "loss": 2.7867, + "step": 75900 + }, + { + "epoch": 0.69, + "learning_rate": 9.424402438251197e-05, + "loss": 2.7937, + "step": 76000 + }, + { + "epoch": 0.69, + "eval_accuracy": 0.4582416153257539, + "eval_loss": 2.8068454265594482, + "eval_runtime": 44.3778, + "eval_samples_per_second": 146.087, + "eval_steps_per_second": 2.456, + "step": 76000 + }, + { + "epoch": 0.69, + "learning_rate": 9.397230248082095e-05, + "loss": 2.7933, + "step": 76100 + }, + { + "epoch": 0.69, + "learning_rate": 9.370058057912994e-05, + "loss": 2.7876, + "step": 76200 + }, + { + "epoch": 0.69, + "learning_rate": 9.342885867743891e-05, + "loss": 2.7885, + "step": 76300 + }, + { + "epoch": 0.69, + "learning_rate": 9.31571367757479e-05, + "loss": 2.7859, + "step": 76400 + }, + { + "epoch": 0.69, + "learning_rate": 9.288541487405689e-05, + "loss": 2.7867, + "step": 76500 + }, + { + "epoch": 0.69, + "learning_rate": 9.261369297236588e-05, + "loss": 2.7882, + "step": 76600 + }, + { + "epoch": 0.69, + "learning_rate": 9.234197107067486e-05, + "loss": 2.7874, + "step": 76700 + }, + { + "epoch": 0.69, + "learning_rate": 9.207024916898384e-05, + "loss": 2.79, + "step": 76800 + }, + { + "epoch": 0.7, + "learning_rate": 9.179852726729282e-05, + "loss": 2.7828, + "step": 76900 + }, + { + "epoch": 0.7, + "learning_rate": 9.152680536560182e-05, + "loss": 2.7853, + "step": 77000 + }, + { + "epoch": 0.7, + "eval_accuracy": 0.4587721128864935, + "eval_loss": 2.801090955734253, + "eval_runtime": 43.1479, + "eval_samples_per_second": 150.251, + "eval_steps_per_second": 2.526, + "step": 77000 + }, + { + "epoch": 0.7, + "learning_rate": 9.12550834639108e-05, + "loss": 2.7861, + "step": 77100 + }, + { + "epoch": 0.7, + "learning_rate": 9.098336156221979e-05, + "loss": 2.793, + "step": 77200 + }, + { + "epoch": 0.7, + "learning_rate": 9.071163966052876e-05, + "loss": 2.7914, + "step": 77300 + }, + { + "epoch": 0.7, + "learning_rate": 9.043991775883774e-05, + "loss": 2.7774, + "step": 77400 + }, + { + "epoch": 0.7, + "learning_rate": 9.016819585714673e-05, + "loss": 2.7791, + "step": 77500 + }, + { + "epoch": 0.7, + "learning_rate": 8.989647395545573e-05, + "loss": 2.7837, + "step": 77600 + }, + { + "epoch": 0.7, + "learning_rate": 8.96247520537647e-05, + "loss": 2.779, + "step": 77700 + }, + { + "epoch": 0.7, + "learning_rate": 8.935303015207368e-05, + "loss": 2.7807, + "step": 77800 + }, + { + "epoch": 0.7, + "learning_rate": 8.908130825038267e-05, + "loss": 2.7832, + "step": 77900 + }, + { + "epoch": 0.71, + "learning_rate": 8.880958634869164e-05, + "loss": 2.7798, + "step": 78000 + }, + { + "epoch": 0.71, + "eval_accuracy": 0.4596697849276993, + "eval_loss": 2.795370578765869, + "eval_runtime": 43.9941, + "eval_samples_per_second": 147.361, + "eval_steps_per_second": 2.478, + "step": 78000 + }, + { + "epoch": 0.71, + "learning_rate": 8.853786444700063e-05, + "loss": 2.7851, + "step": 78100 + }, + { + "epoch": 0.71, + "learning_rate": 8.826885976432654e-05, + "loss": 2.7819, + "step": 78200 + }, + { + "epoch": 0.71, + "learning_rate": 8.799713786263551e-05, + "loss": 2.7767, + "step": 78300 + }, + { + "epoch": 0.71, + "learning_rate": 8.77254159609445e-05, + "loss": 2.7745, + "step": 78400 + }, + { + "epoch": 0.71, + "learning_rate": 8.745369405925347e-05, + "loss": 2.7807, + "step": 78500 + }, + { + "epoch": 0.71, + "learning_rate": 8.718197215756246e-05, + "loss": 2.7828, + "step": 78600 + }, + { + "epoch": 0.71, + "learning_rate": 8.691025025587145e-05, + "loss": 2.7768, + "step": 78700 + }, + { + "epoch": 0.71, + "learning_rate": 8.663852835418044e-05, + "loss": 2.7749, + "step": 78800 + }, + { + "epoch": 0.71, + "learning_rate": 8.636680645248942e-05, + "loss": 2.7782, + "step": 78900 + }, + { + "epoch": 0.71, + "learning_rate": 8.60950845507984e-05, + "loss": 2.7851, + "step": 79000 + }, + { + "epoch": 0.71, + "eval_accuracy": 0.4597998384916206, + "eval_loss": 2.7913172245025635, + "eval_runtime": 43.6998, + "eval_samples_per_second": 148.353, + "eval_steps_per_second": 2.494, + "step": 79000 + }, + { + "epoch": 0.72, + "learning_rate": 8.582336264910738e-05, + "loss": 2.7722, + "step": 79100 + }, + { + "epoch": 0.72, + "learning_rate": 8.555435796643328e-05, + "loss": 2.7695, + "step": 79200 + }, + { + "epoch": 0.72, + "learning_rate": 8.528535328375917e-05, + "loss": 2.7732, + "step": 79300 + }, + { + "epoch": 0.72, + "learning_rate": 8.501363138206815e-05, + "loss": 2.7714, + "step": 79400 + }, + { + "epoch": 0.72, + "learning_rate": 8.474190948037714e-05, + "loss": 2.7739, + "step": 79500 + }, + { + "epoch": 0.72, + "learning_rate": 8.447018757868613e-05, + "loss": 2.7733, + "step": 79600 + }, + { + "epoch": 0.72, + "learning_rate": 8.419846567699512e-05, + "loss": 2.773, + "step": 79700 + }, + { + "epoch": 0.72, + "learning_rate": 8.392674377530409e-05, + "loss": 2.7754, + "step": 79800 + }, + { + "epoch": 0.72, + "learning_rate": 8.365502187361308e-05, + "loss": 2.7817, + "step": 79900 + }, + { + "epoch": 0.72, + "learning_rate": 8.338329997192206e-05, + "loss": 2.7831, + "step": 80000 + }, + { + "epoch": 0.72, + "eval_accuracy": 0.46004845251381443, + "eval_loss": 2.78973126411438, + "eval_runtime": 44.9439, + "eval_samples_per_second": 144.247, + "eval_steps_per_second": 2.425, + "step": 80000 + }, + { + "epoch": 0.72, + "learning_rate": 8.311157807023106e-05, + "loss": 2.7739, + "step": 80100 + }, + { + "epoch": 0.73, + "learning_rate": 8.283985616854003e-05, + "loss": 2.781, + "step": 80200 + }, + { + "epoch": 0.73, + "learning_rate": 8.256813426684902e-05, + "loss": 2.7773, + "step": 80300 + }, + { + "epoch": 0.73, + "learning_rate": 8.2296412365158e-05, + "loss": 2.7688, + "step": 80400 + }, + { + "epoch": 0.73, + "learning_rate": 8.202469046346699e-05, + "loss": 2.7765, + "step": 80500 + }, + { + "epoch": 0.73, + "learning_rate": 8.175568578079289e-05, + "loss": 2.7735, + "step": 80600 + }, + { + "epoch": 0.73, + "learning_rate": 8.148396387910187e-05, + "loss": 2.7692, + "step": 80700 + }, + { + "epoch": 0.73, + "learning_rate": 8.121224197741084e-05, + "loss": 2.7661, + "step": 80800 + }, + { + "epoch": 0.73, + "learning_rate": 8.094052007571983e-05, + "loss": 2.7714, + "step": 80900 + }, + { + "epoch": 0.73, + "learning_rate": 8.06687981740288e-05, + "loss": 2.7773, + "step": 81000 + }, + { + "epoch": 0.73, + "eval_accuracy": 0.4603297311520629, + "eval_loss": 2.786165475845337, + "eval_runtime": 45.3636, + "eval_samples_per_second": 142.912, + "eval_steps_per_second": 2.403, + "step": 81000 + }, + { + "epoch": 0.73, + "learning_rate": 8.03970762723378e-05, + "loss": 2.77, + "step": 81100 + }, + { + "epoch": 0.73, + "learning_rate": 8.012535437064678e-05, + "loss": 2.772, + "step": 81200 + }, + { + "epoch": 0.74, + "learning_rate": 7.985363246895577e-05, + "loss": 2.7751, + "step": 81300 + }, + { + "epoch": 0.74, + "learning_rate": 7.958191056726475e-05, + "loss": 2.7705, + "step": 81400 + }, + { + "epoch": 0.74, + "learning_rate": 7.931018866557374e-05, + "loss": 2.7711, + "step": 81500 + }, + { + "epoch": 0.74, + "learning_rate": 7.903846676388271e-05, + "loss": 2.7666, + "step": 81600 + }, + { + "epoch": 0.74, + "learning_rate": 7.87667448621917e-05, + "loss": 2.7678, + "step": 81700 + }, + { + "epoch": 0.74, + "learning_rate": 7.84977401795176e-05, + "loss": 2.7707, + "step": 81800 + }, + { + "epoch": 0.74, + "learning_rate": 7.822601827782658e-05, + "loss": 2.7624, + "step": 81900 + }, + { + "epoch": 0.74, + "learning_rate": 7.795429637613557e-05, + "loss": 2.7688, + "step": 82000 + }, + { + "epoch": 0.74, + "eval_accuracy": 0.4608795855223163, + "eval_loss": 2.7835707664489746, + "eval_runtime": 44.1206, + "eval_samples_per_second": 146.938, + "eval_steps_per_second": 2.47, + "step": 82000 + }, + { + "epoch": 0.74, + "learning_rate": 7.768257447444454e-05, + "loss": 2.7652, + "step": 82100 + }, + { + "epoch": 0.74, + "learning_rate": 7.741085257275354e-05, + "loss": 2.763, + "step": 82200 + }, + { + "epoch": 0.74, + "learning_rate": 7.713913067106252e-05, + "loss": 2.7718, + "step": 82300 + }, + { + "epoch": 0.74, + "learning_rate": 7.686740876937151e-05, + "loss": 2.774, + "step": 82400 + }, + { + "epoch": 0.75, + "learning_rate": 7.659568686768048e-05, + "loss": 2.7624, + "step": 82500 + }, + { + "epoch": 0.75, + "learning_rate": 7.632396496598946e-05, + "loss": 2.7672, + "step": 82600 + }, + { + "epoch": 0.75, + "learning_rate": 7.605224306429845e-05, + "loss": 2.7646, + "step": 82700 + }, + { + "epoch": 0.75, + "learning_rate": 7.578052116260744e-05, + "loss": 2.7643, + "step": 82800 + }, + { + "epoch": 0.75, + "learning_rate": 7.550879926091643e-05, + "loss": 2.7636, + "step": 82900 + }, + { + "epoch": 0.75, + "learning_rate": 7.523979457824232e-05, + "loss": 2.7658, + "step": 83000 + }, + { + "epoch": 0.75, + "eval_accuracy": 0.4610453282037788, + "eval_loss": 2.7798171043395996, + "eval_runtime": 44.7143, + "eval_samples_per_second": 144.987, + "eval_steps_per_second": 2.438, + "step": 83000 + }, + { + "epoch": 0.75, + "learning_rate": 7.49680726765513e-05, + "loss": 2.7694, + "step": 83100 + }, + { + "epoch": 0.75, + "learning_rate": 7.469635077486028e-05, + "loss": 2.7662, + "step": 83200 + }, + { + "epoch": 0.75, + "learning_rate": 7.442734609218618e-05, + "loss": 2.7624, + "step": 83300 + }, + { + "epoch": 0.75, + "learning_rate": 7.415562419049516e-05, + "loss": 2.7632, + "step": 83400 + }, + { + "epoch": 0.75, + "learning_rate": 7.388390228880415e-05, + "loss": 2.7697, + "step": 83500 + }, + { + "epoch": 0.76, + "learning_rate": 7.361218038711314e-05, + "loss": 2.7663, + "step": 83600 + }, + { + "epoch": 0.76, + "learning_rate": 7.334045848542211e-05, + "loss": 2.7623, + "step": 83700 + }, + { + "epoch": 0.76, + "learning_rate": 7.306873658373109e-05, + "loss": 2.7685, + "step": 83800 + }, + { + "epoch": 0.76, + "learning_rate": 7.279701468204009e-05, + "loss": 2.7702, + "step": 83900 + }, + { + "epoch": 0.76, + "learning_rate": 7.252529278034907e-05, + "loss": 2.7622, + "step": 84000 + }, + { + "epoch": 0.76, + "eval_accuracy": 0.4611511857558078, + "eval_loss": 2.781484603881836, + "eval_runtime": 43.3638, + "eval_samples_per_second": 149.503, + "eval_steps_per_second": 2.514, + "step": 84000 + }, + { + "epoch": 0.76, + "learning_rate": 7.225357087865804e-05, + "loss": 2.7672, + "step": 84100 + }, + { + "epoch": 0.76, + "learning_rate": 7.198184897696703e-05, + "loss": 2.7652, + "step": 84200 + }, + { + "epoch": 0.76, + "learning_rate": 7.171012707527602e-05, + "loss": 2.7671, + "step": 84300 + }, + { + "epoch": 0.76, + "learning_rate": 7.143840517358501e-05, + "loss": 2.7621, + "step": 84400 + }, + { + "epoch": 0.76, + "learning_rate": 7.11694004909109e-05, + "loss": 2.7662, + "step": 84500 + }, + { + "epoch": 0.76, + "learning_rate": 7.089767858921989e-05, + "loss": 2.7684, + "step": 84600 + }, + { + "epoch": 0.77, + "learning_rate": 7.062595668752886e-05, + "loss": 2.7662, + "step": 84700 + }, + { + "epoch": 0.77, + "learning_rate": 7.035423478583785e-05, + "loss": 2.7638, + "step": 84800 + }, + { + "epoch": 0.77, + "learning_rate": 7.008251288414684e-05, + "loss": 2.7639, + "step": 84900 + }, + { + "epoch": 0.77, + "learning_rate": 6.981079098245581e-05, + "loss": 2.7691, + "step": 85000 + }, + { + "epoch": 0.77, + "eval_accuracy": 0.46120986108464673, + "eval_loss": 2.7783455848693848, + "eval_runtime": 43.5919, + "eval_samples_per_second": 148.72, + "eval_steps_per_second": 2.5, + "step": 85000 + }, + { + "epoch": 0.77, + "learning_rate": 6.95390690807648e-05, + "loss": 2.7649, + "step": 85100 + }, + { + "epoch": 0.77, + "learning_rate": 6.926734717907379e-05, + "loss": 2.7638, + "step": 85200 + }, + { + "epoch": 0.77, + "learning_rate": 6.899562527738277e-05, + "loss": 2.7675, + "step": 85300 + }, + { + "epoch": 0.77, + "learning_rate": 6.872390337569176e-05, + "loss": 2.7657, + "step": 85400 + }, + { + "epoch": 0.77, + "learning_rate": 6.845218147400074e-05, + "loss": 2.7612, + "step": 85500 + }, + { + "epoch": 0.77, + "learning_rate": 6.818045957230972e-05, + "loss": 2.7682, + "step": 85600 + }, + { + "epoch": 0.77, + "learning_rate": 6.79087376706187e-05, + "loss": 2.7588, + "step": 85700 + }, + { + "epoch": 0.78, + "learning_rate": 6.763701576892768e-05, + "loss": 2.765, + "step": 85800 + }, + { + "epoch": 0.78, + "learning_rate": 6.736529386723667e-05, + "loss": 2.7556, + "step": 85900 + }, + { + "epoch": 0.78, + "learning_rate": 6.709357196554565e-05, + "loss": 2.7579, + "step": 86000 + }, + { + "epoch": 0.78, + "eval_accuracy": 0.4619333218402277, + "eval_loss": 2.7711987495422363, + "eval_runtime": 43.3357, + "eval_samples_per_second": 149.6, + "eval_steps_per_second": 2.515, + "step": 86000 + }, + { + "epoch": 0.78, + "learning_rate": 6.682185006385464e-05, + "loss": 2.7538, + "step": 86100 + }, + { + "epoch": 0.78, + "learning_rate": 6.655012816216363e-05, + "loss": 2.7596, + "step": 86200 + }, + { + "epoch": 0.78, + "learning_rate": 6.62784062604726e-05, + "loss": 2.7512, + "step": 86300 + }, + { + "epoch": 0.78, + "learning_rate": 6.600668435878159e-05, + "loss": 2.7559, + "step": 86400 + }, + { + "epoch": 0.78, + "learning_rate": 6.573496245709058e-05, + "loss": 2.7574, + "step": 86500 + }, + { + "epoch": 0.78, + "learning_rate": 6.546324055539957e-05, + "loss": 2.7614, + "step": 86600 + }, + { + "epoch": 0.78, + "learning_rate": 6.519423587272546e-05, + "loss": 2.7501, + "step": 86700 + }, + { + "epoch": 0.78, + "learning_rate": 6.492251397103445e-05, + "loss": 2.7488, + "step": 86800 + }, + { + "epoch": 0.79, + "learning_rate": 6.465079206934342e-05, + "loss": 2.7497, + "step": 86900 + }, + { + "epoch": 0.79, + "learning_rate": 6.437907016765241e-05, + "loss": 2.7614, + "step": 87000 + }, + { + "epoch": 0.79, + "eval_accuracy": 0.46246986840394033, + "eval_loss": 2.7673110961914062, + "eval_runtime": 43.038, + "eval_samples_per_second": 150.634, + "eval_steps_per_second": 2.533, + "step": 87000 + }, + { + "epoch": 0.79, + "learning_rate": 6.41073482659614e-05, + "loss": 2.7544, + "step": 87100 + }, + { + "epoch": 0.79, + "learning_rate": 6.383834358328728e-05, + "loss": 2.7546, + "step": 87200 + }, + { + "epoch": 0.79, + "learning_rate": 6.356662168159627e-05, + "loss": 2.7564, + "step": 87300 + }, + { + "epoch": 0.79, + "learning_rate": 6.329489977990525e-05, + "loss": 2.759, + "step": 87400 + }, + { + "epoch": 0.79, + "learning_rate": 6.302317787821423e-05, + "loss": 2.7586, + "step": 87500 + }, + { + "epoch": 0.79, + "learning_rate": 6.275145597652322e-05, + "loss": 2.7546, + "step": 87600 + }, + { + "epoch": 0.79, + "learning_rate": 6.247973407483221e-05, + "loss": 2.7548, + "step": 87700 + }, + { + "epoch": 0.79, + "learning_rate": 6.220801217314118e-05, + "loss": 2.7527, + "step": 87800 + }, + { + "epoch": 0.79, + "learning_rate": 6.193629027145017e-05, + "loss": 2.7607, + "step": 87900 + }, + { + "epoch": 0.8, + "learning_rate": 6.166456836975916e-05, + "loss": 2.7592, + "step": 88000 + }, + { + "epoch": 0.8, + "eval_accuracy": 0.46232166783109974, + "eval_loss": 2.7691469192504883, + "eval_runtime": 43.5697, + "eval_samples_per_second": 148.796, + "eval_steps_per_second": 2.502, + "step": 88000 + }, + { + "epoch": 0.8, + "learning_rate": 6.139284646806815e-05, + "loss": 2.7481, + "step": 88100 + }, + { + "epoch": 0.8, + "learning_rate": 6.112112456637712e-05, + "loss": 2.7579, + "step": 88200 + }, + { + "epoch": 0.8, + "learning_rate": 6.0849402664686106e-05, + "loss": 2.7559, + "step": 88300 + }, + { + "epoch": 0.8, + "learning_rate": 6.05776807629951e-05, + "loss": 2.7515, + "step": 88400 + }, + { + "epoch": 0.8, + "learning_rate": 6.030595886130408e-05, + "loss": 2.7524, + "step": 88500 + }, + { + "epoch": 0.8, + "learning_rate": 6.003423695961306e-05, + "loss": 2.7395, + "step": 88600 + }, + { + "epoch": 0.8, + "learning_rate": 5.976251505792205e-05, + "loss": 2.7438, + "step": 88700 + }, + { + "epoch": 0.8, + "learning_rate": 5.949079315623103e-05, + "loss": 2.7468, + "step": 88800 + }, + { + "epoch": 0.8, + "learning_rate": 5.921907125454001e-05, + "loss": 2.7423, + "step": 88900 + }, + { + "epoch": 0.8, + "learning_rate": 5.8947349352849e-05, + "loss": 2.7551, + "step": 89000 + }, + { + "epoch": 0.8, + "eval_accuracy": 0.4633808482516869, + "eval_loss": 2.760658025741577, + "eval_runtime": 43.7777, + "eval_samples_per_second": 148.089, + "eval_steps_per_second": 2.49, + "step": 89000 + }, + { + "epoch": 0.81, + "learning_rate": 5.867562745115798e-05, + "loss": 2.7352, + "step": 89100 + }, + { + "epoch": 0.81, + "learning_rate": 5.8403905549466965e-05, + "loss": 2.751, + "step": 89200 + }, + { + "epoch": 0.81, + "learning_rate": 5.813490086679286e-05, + "loss": 2.7456, + "step": 89300 + }, + { + "epoch": 0.81, + "learning_rate": 5.7863178965101844e-05, + "loss": 2.7491, + "step": 89400 + }, + { + "epoch": 0.81, + "learning_rate": 5.759145706341083e-05, + "loss": 2.7477, + "step": 89500 + }, + { + "epoch": 0.81, + "learning_rate": 5.7319735161719815e-05, + "loss": 2.7431, + "step": 89600 + }, + { + "epoch": 0.81, + "learning_rate": 5.70480132600288e-05, + "loss": 2.7406, + "step": 89700 + }, + { + "epoch": 0.81, + "learning_rate": 5.6776291358337786e-05, + "loss": 2.7444, + "step": 89800 + }, + { + "epoch": 0.81, + "learning_rate": 5.650456945664677e-05, + "loss": 2.7437, + "step": 89900 + }, + { + "epoch": 0.81, + "learning_rate": 5.623284755495574e-05, + "loss": 2.7397, + "step": 90000 + }, + { + "epoch": 0.81, + "eval_accuracy": 0.4636597072887461, + "eval_loss": 2.7578768730163574, + "eval_runtime": 43.3807, + "eval_samples_per_second": 149.444, + "eval_steps_per_second": 2.513, + "step": 90000 + }, + { + "epoch": 0.81, + "learning_rate": 5.596112565326473e-05, + "loss": 2.7456, + "step": 90100 + }, + { + "epoch": 0.82, + "learning_rate": 5.5689403751573714e-05, + "loss": 2.7393, + "step": 90200 + }, + { + "epoch": 0.82, + "learning_rate": 5.54176818498827e-05, + "loss": 2.74, + "step": 90300 + }, + { + "epoch": 0.82, + "learning_rate": 5.5145959948191685e-05, + "loss": 2.7411, + "step": 90400 + }, + { + "epoch": 0.82, + "learning_rate": 5.487695526551758e-05, + "loss": 2.747, + "step": 90500 + }, + { + "epoch": 0.82, + "learning_rate": 5.4605233363826564e-05, + "loss": 2.741, + "step": 90600 + }, + { + "epoch": 0.82, + "learning_rate": 5.433622868115246e-05, + "loss": 2.7441, + "step": 90700 + }, + { + "epoch": 0.82, + "learning_rate": 5.406450677946144e-05, + "loss": 2.7447, + "step": 90800 + }, + { + "epoch": 0.82, + "learning_rate": 5.3792784877770425e-05, + "loss": 2.7517, + "step": 90900 + }, + { + "epoch": 0.82, + "learning_rate": 5.3521062976079414e-05, + "loss": 2.7357, + "step": 91000 + }, + { + "epoch": 0.82, + "eval_accuracy": 0.4636022417605018, + "eval_loss": 2.758023738861084, + "eval_runtime": 43.2538, + "eval_samples_per_second": 149.883, + "eval_steps_per_second": 2.52, + "step": 91000 + }, + { + "epoch": 0.82, + "learning_rate": 5.3249341074388396e-05, + "loss": 2.7429, + "step": 91100 + }, + { + "epoch": 0.82, + "learning_rate": 5.297761917269738e-05, + "loss": 2.7445, + "step": 91200 + }, + { + "epoch": 0.83, + "learning_rate": 5.270589727100637e-05, + "loss": 2.7473, + "step": 91300 + }, + { + "epoch": 0.83, + "learning_rate": 5.243417536931535e-05, + "loss": 2.7404, + "step": 91400 + }, + { + "epoch": 0.83, + "learning_rate": 5.216245346762434e-05, + "loss": 2.7401, + "step": 91500 + }, + { + "epoch": 0.83, + "learning_rate": 5.189073156593331e-05, + "loss": 2.7441, + "step": 91600 + }, + { + "epoch": 0.83, + "learning_rate": 5.1619009664242295e-05, + "loss": 2.737, + "step": 91700 + }, + { + "epoch": 0.83, + "learning_rate": 5.1347287762551284e-05, + "loss": 2.7337, + "step": 91800 + }, + { + "epoch": 0.83, + "learning_rate": 5.1075565860860266e-05, + "loss": 2.7422, + "step": 91900 + }, + { + "epoch": 0.83, + "learning_rate": 5.080384395916925e-05, + "loss": 2.7452, + "step": 92000 + }, + { + "epoch": 0.83, + "eval_accuracy": 0.46426944678843307, + "eval_loss": 2.751744031906128, + "eval_runtime": 44.8905, + "eval_samples_per_second": 144.418, + "eval_steps_per_second": 2.428, + "step": 92000 + }, + { + "epoch": 0.83, + "learning_rate": 5.0532122057478237e-05, + "loss": 2.7387, + "step": 92100 + }, + { + "epoch": 0.83, + "learning_rate": 5.026311737480413e-05, + "loss": 2.7342, + "step": 92200 + }, + { + "epoch": 0.83, + "learning_rate": 4.9991395473113116e-05, + "loss": 2.7349, + "step": 92300 + }, + { + "epoch": 0.84, + "learning_rate": 4.97196735714221e-05, + "loss": 2.7388, + "step": 92400 + }, + { + "epoch": 0.84, + "learning_rate": 4.944795166973108e-05, + "loss": 2.7397, + "step": 92500 + }, + { + "epoch": 0.84, + "learning_rate": 4.917622976804007e-05, + "loss": 2.7352, + "step": 92600 + }, + { + "epoch": 0.84, + "learning_rate": 4.890450786634905e-05, + "loss": 2.7392, + "step": 92700 + }, + { + "epoch": 0.84, + "learning_rate": 4.863278596465803e-05, + "loss": 2.7419, + "step": 92800 + }, + { + "epoch": 0.84, + "learning_rate": 4.836106406296702e-05, + "loss": 2.738, + "step": 92900 + }, + { + "epoch": 0.84, + "learning_rate": 4.8089342161276004e-05, + "loss": 2.7418, + "step": 93000 + }, + { + "epoch": 0.84, + "eval_accuracy": 0.46412548051767366, + "eval_loss": 2.7533059120178223, + "eval_runtime": 43.1643, + "eval_samples_per_second": 150.193, + "eval_steps_per_second": 2.525, + "step": 93000 + }, + { + "epoch": 0.84, + "learning_rate": 4.781762025958498e-05, + "loss": 2.7372, + "step": 93100 + }, + { + "epoch": 0.84, + "learning_rate": 4.7545898357893974e-05, + "loss": 2.7369, + "step": 93200 + }, + { + "epoch": 0.84, + "learning_rate": 4.727417645620295e-05, + "loss": 2.7331, + "step": 93300 + }, + { + "epoch": 0.84, + "learning_rate": 4.700245455451193e-05, + "loss": 2.7379, + "step": 93400 + }, + { + "epoch": 0.85, + "learning_rate": 4.673073265282092e-05, + "loss": 2.7341, + "step": 93500 + }, + { + "epoch": 0.85, + "learning_rate": 4.64590107511299e-05, + "loss": 2.7359, + "step": 93600 + }, + { + "epoch": 0.85, + "learning_rate": 4.618728884943889e-05, + "loss": 2.737, + "step": 93700 + }, + { + "epoch": 0.85, + "learning_rate": 4.5915566947747873e-05, + "loss": 2.7343, + "step": 93800 + }, + { + "epoch": 0.85, + "learning_rate": 4.564656226507377e-05, + "loss": 2.7346, + "step": 93900 + }, + { + "epoch": 0.85, + "learning_rate": 4.537484036338275e-05, + "loss": 2.7379, + "step": 94000 + }, + { + "epoch": 0.85, + "eval_accuracy": 0.46473280041617143, + "eval_loss": 2.748091697692871, + "eval_runtime": 43.4169, + "eval_samples_per_second": 149.32, + "eval_steps_per_second": 2.511, + "step": 94000 + }, + { + "epoch": 0.85, + "learning_rate": 4.5103118461691735e-05, + "loss": 2.7341, + "step": 94100 + }, + { + "epoch": 0.85, + "learning_rate": 4.4831396560000724e-05, + "loss": 2.7431, + "step": 94200 + }, + { + "epoch": 0.85, + "learning_rate": 4.4559674658309706e-05, + "loss": 2.7347, + "step": 94300 + }, + { + "epoch": 0.85, + "learning_rate": 4.428795275661869e-05, + "loss": 2.7366, + "step": 94400 + }, + { + "epoch": 0.85, + "learning_rate": 4.4016230854927676e-05, + "loss": 2.7344, + "step": 94500 + }, + { + "epoch": 0.86, + "learning_rate": 4.374450895323666e-05, + "loss": 2.7382, + "step": 94600 + }, + { + "epoch": 0.86, + "learning_rate": 4.347278705154564e-05, + "loss": 2.7279, + "step": 94700 + }, + { + "epoch": 0.86, + "learning_rate": 4.320106514985463e-05, + "loss": 2.7307, + "step": 94800 + }, + { + "epoch": 0.86, + "learning_rate": 4.292934324816361e-05, + "loss": 2.7275, + "step": 94900 + }, + { + "epoch": 0.86, + "learning_rate": 4.26603385654895e-05, + "loss": 2.7308, + "step": 95000 + }, + { + "epoch": 0.86, + "eval_accuracy": 0.4653649212268588, + "eval_loss": 2.7459847927093506, + "eval_runtime": 43.1356, + "eval_samples_per_second": 150.294, + "eval_steps_per_second": 2.527, + "step": 95000 + }, + { + "epoch": 0.86, + "learning_rate": 4.2388616663798484e-05, + "loss": 2.7304, + "step": 95100 + }, + { + "epoch": 0.86, + "learning_rate": 4.211689476210747e-05, + "loss": 2.7334, + "step": 95200 + }, + { + "epoch": 0.86, + "learning_rate": 4.1845172860416455e-05, + "loss": 2.7324, + "step": 95300 + }, + { + "epoch": 0.86, + "learning_rate": 4.157345095872544e-05, + "loss": 2.7338, + "step": 95400 + }, + { + "epoch": 0.86, + "learning_rate": 4.1301729057034425e-05, + "loss": 2.7334, + "step": 95500 + }, + { + "epoch": 0.86, + "learning_rate": 4.103000715534341e-05, + "loss": 2.7323, + "step": 95600 + }, + { + "epoch": 0.87, + "learning_rate": 4.075828525365239e-05, + "loss": 2.7338, + "step": 95700 + }, + { + "epoch": 0.87, + "learning_rate": 4.048656335196138e-05, + "loss": 2.73, + "step": 95800 + }, + { + "epoch": 0.87, + "learning_rate": 4.021484145027036e-05, + "loss": 2.7367, + "step": 95900 + }, + { + "epoch": 0.87, + "learning_rate": 3.994311954857934e-05, + "loss": 2.727, + "step": 96000 + }, + { + "epoch": 0.87, + "eval_accuracy": 0.46549799929226665, + "eval_loss": 2.740849018096924, + "eval_runtime": 43.5693, + "eval_samples_per_second": 148.797, + "eval_steps_per_second": 2.502, + "step": 96000 + }, + { + "epoch": 0.87, + "learning_rate": 3.967139764688833e-05, + "loss": 2.7257, + "step": 96100 + }, + { + "epoch": 0.87, + "learning_rate": 3.939967574519731e-05, + "loss": 2.7251, + "step": 96200 + }, + { + "epoch": 0.87, + "learning_rate": 3.9127953843506295e-05, + "loss": 2.7236, + "step": 96300 + }, + { + "epoch": 0.87, + "learning_rate": 3.8856231941815284e-05, + "loss": 2.7224, + "step": 96400 + }, + { + "epoch": 0.87, + "learning_rate": 3.8584510040124266e-05, + "loss": 2.7204, + "step": 96500 + }, + { + "epoch": 0.87, + "learning_rate": 3.831278813843325e-05, + "loss": 2.7249, + "step": 96600 + }, + { + "epoch": 0.87, + "learning_rate": 3.804106623674224e-05, + "loss": 2.7214, + "step": 96700 + }, + { + "epoch": 0.88, + "learning_rate": 3.776934433505122e-05, + "loss": 2.7242, + "step": 96800 + }, + { + "epoch": 0.88, + "learning_rate": 3.74976224333602e-05, + "loss": 2.7147, + "step": 96900 + }, + { + "epoch": 0.88, + "learning_rate": 3.722861775068609e-05, + "loss": 2.7282, + "step": 97000 + }, + { + "epoch": 0.88, + "eval_accuracy": 0.4663823635269317, + "eval_loss": 2.7350597381591797, + "eval_runtime": 43.4285, + "eval_samples_per_second": 149.28, + "eval_steps_per_second": 2.51, + "step": 97000 + }, + { + "epoch": 0.88, + "learning_rate": 3.695689584899508e-05, + "loss": 2.718, + "step": 97100 + }, + { + "epoch": 0.88, + "learning_rate": 3.668517394730406e-05, + "loss": 2.7174, + "step": 97200 + }, + { + "epoch": 0.88, + "learning_rate": 3.6413452045613044e-05, + "loss": 2.7205, + "step": 97300 + }, + { + "epoch": 0.88, + "learning_rate": 3.614173014392203e-05, + "loss": 2.7195, + "step": 97400 + }, + { + "epoch": 0.88, + "learning_rate": 3.5870008242231015e-05, + "loss": 2.7172, + "step": 97500 + }, + { + "epoch": 0.88, + "learning_rate": 3.559828634054e-05, + "loss": 2.7128, + "step": 97600 + }, + { + "epoch": 0.88, + "learning_rate": 3.532656443884898e-05, + "loss": 2.7192, + "step": 97700 + }, + { + "epoch": 0.88, + "learning_rate": 3.505484253715797e-05, + "loss": 2.7191, + "step": 97800 + }, + { + "epoch": 0.89, + "learning_rate": 3.478312063546695e-05, + "loss": 2.7178, + "step": 97900 + }, + { + "epoch": 0.89, + "learning_rate": 3.451139873377593e-05, + "loss": 2.7133, + "step": 98000 + }, + { + "epoch": 0.89, + "eval_accuracy": 0.46685176615764307, + "eval_loss": 2.730079412460327, + "eval_runtime": 43.3235, + "eval_samples_per_second": 149.642, + "eval_steps_per_second": 2.516, + "step": 98000 + }, + { + "epoch": 0.89, + "learning_rate": 3.423967683208492e-05, + "loss": 2.7164, + "step": 98100 + }, + { + "epoch": 0.89, + "learning_rate": 3.39679549303939e-05, + "loss": 2.7106, + "step": 98200 + }, + { + "epoch": 0.89, + "learning_rate": 3.3696233028702885e-05, + "loss": 2.715, + "step": 98300 + }, + { + "epoch": 0.89, + "learning_rate": 3.3424511127011874e-05, + "loss": 2.7091, + "step": 98400 + }, + { + "epoch": 0.89, + "learning_rate": 3.3152789225320856e-05, + "loss": 2.7093, + "step": 98500 + }, + { + "epoch": 0.89, + "learning_rate": 3.288106732362984e-05, + "loss": 2.7116, + "step": 98600 + }, + { + "epoch": 0.89, + "learning_rate": 3.260934542193883e-05, + "loss": 2.7172, + "step": 98700 + }, + { + "epoch": 0.89, + "learning_rate": 3.233762352024781e-05, + "loss": 2.7072, + "step": 98800 + }, + { + "epoch": 0.89, + "learning_rate": 3.206590161855679e-05, + "loss": 2.7165, + "step": 98900 + }, + { + "epoch": 0.9, + "learning_rate": 3.179417971686577e-05, + "loss": 2.7136, + "step": 99000 + }, + { + "epoch": 0.9, + "eval_accuracy": 0.4673356863954899, + "eval_loss": 2.7250616550445557, + "eval_runtime": 43.1535, + "eval_samples_per_second": 150.231, + "eval_steps_per_second": 2.526, + "step": 99000 + }, + { + "epoch": 0.9, + "learning_rate": 3.152245781517476e-05, + "loss": 2.7117, + "step": 99100 + }, + { + "epoch": 0.9, + "learning_rate": 3.1250735913483744e-05, + "loss": 2.7099, + "step": 99200 + }, + { + "epoch": 0.9, + "learning_rate": 3.0979014011792726e-05, + "loss": 2.715, + "step": 99300 + }, + { + "epoch": 0.9, + "learning_rate": 3.0707292110101715e-05, + "loss": 2.7119, + "step": 99400 + }, + { + "epoch": 0.9, + "learning_rate": 3.0435570208410697e-05, + "loss": 2.7136, + "step": 99500 + }, + { + "epoch": 0.9, + "learning_rate": 3.016384830671968e-05, + "loss": 2.7069, + "step": 99600 + }, + { + "epoch": 0.9, + "learning_rate": 2.9892126405028664e-05, + "loss": 2.7092, + "step": 99700 + }, + { + "epoch": 0.9, + "learning_rate": 2.962040450333765e-05, + "loss": 2.7052, + "step": 99800 + }, + { + "epoch": 0.9, + "learning_rate": 2.934868260164663e-05, + "loss": 2.7099, + "step": 99900 + }, + { + "epoch": 0.9, + "learning_rate": 2.9076960699955617e-05, + "loss": 2.7108, + "step": 100000 + }, + { + "epoch": 0.9, + "eval_accuracy": 0.46786981335801325, + "eval_loss": 2.7208478450775146, + "eval_runtime": 43.4331, + "eval_samples_per_second": 149.264, + "eval_steps_per_second": 2.51, + "step": 100000 + }, + { + "epoch": 0.91, + "learning_rate": 2.8807956017281514e-05, + "loss": 2.7137, + "step": 100100 + }, + { + "epoch": 0.91, + "learning_rate": 2.8536234115590493e-05, + "loss": 2.7069, + "step": 100200 + }, + { + "epoch": 0.91, + "learning_rate": 2.8264512213899478e-05, + "loss": 2.698, + "step": 100300 + }, + { + "epoch": 0.91, + "learning_rate": 2.7992790312208464e-05, + "loss": 2.7027, + "step": 100400 + }, + { + "epoch": 0.91, + "learning_rate": 2.7721068410517446e-05, + "loss": 2.7062, + "step": 100500 + }, + { + "epoch": 0.91, + "learning_rate": 2.744934650882643e-05, + "loss": 2.7064, + "step": 100600 + }, + { + "epoch": 0.91, + "learning_rate": 2.718034182615232e-05, + "loss": 2.7059, + "step": 100700 + }, + { + "epoch": 0.91, + "learning_rate": 2.691133714347822e-05, + "loss": 2.7146, + "step": 100800 + }, + { + "epoch": 0.91, + "learning_rate": 2.6639615241787204e-05, + "loss": 2.7036, + "step": 100900 + }, + { + "epoch": 0.91, + "learning_rate": 2.6367893340096186e-05, + "loss": 2.7051, + "step": 101000 + }, + { + "epoch": 0.91, + "eval_accuracy": 0.46807245495761163, + "eval_loss": 2.7191717624664307, + "eval_runtime": 43.4633, + "eval_samples_per_second": 149.16, + "eval_steps_per_second": 2.508, + "step": 101000 + }, + { + "epoch": 0.91, + "learning_rate": 2.609617143840517e-05, + "loss": 2.7007, + "step": 101100 + }, + { + "epoch": 0.91, + "learning_rate": 2.5824449536714157e-05, + "loss": 2.7024, + "step": 101200 + }, + { + "epoch": 0.92, + "learning_rate": 2.555272763502314e-05, + "loss": 2.7027, + "step": 101300 + }, + { + "epoch": 0.92, + "learning_rate": 2.5281005733332124e-05, + "loss": 2.7082, + "step": 101400 + }, + { + "epoch": 0.92, + "learning_rate": 2.500928383164111e-05, + "loss": 2.7067, + "step": 101500 + }, + { + "epoch": 0.92, + "learning_rate": 2.4737561929950092e-05, + "loss": 2.7044, + "step": 101600 + }, + { + "epoch": 0.92, + "learning_rate": 2.4465840028259074e-05, + "loss": 2.705, + "step": 101700 + }, + { + "epoch": 0.92, + "learning_rate": 2.419411812656806e-05, + "loss": 2.7069, + "step": 101800 + }, + { + "epoch": 0.92, + "learning_rate": 2.3922396224877045e-05, + "loss": 2.7005, + "step": 101900 + }, + { + "epoch": 0.92, + "learning_rate": 2.3650674323186027e-05, + "loss": 2.7013, + "step": 102000 + }, + { + "epoch": 0.92, + "eval_accuracy": 0.4687317962816779, + "eval_loss": 2.7151107788085938, + "eval_runtime": 43.2863, + "eval_samples_per_second": 149.77, + "eval_steps_per_second": 2.518, + "step": 102000 + }, + { + "epoch": 0.92, + "learning_rate": 2.3378952421495012e-05, + "loss": 2.7029, + "step": 102100 + }, + { + "epoch": 0.92, + "learning_rate": 2.3107230519803998e-05, + "loss": 2.7007, + "step": 102200 + }, + { + "epoch": 0.92, + "learning_rate": 2.283550861811298e-05, + "loss": 2.7089, + "step": 102300 + }, + { + "epoch": 0.93, + "learning_rate": 2.2563786716421965e-05, + "loss": 2.7018, + "step": 102400 + }, + { + "epoch": 0.93, + "learning_rate": 2.229206481473095e-05, + "loss": 2.6984, + "step": 102500 + }, + { + "epoch": 0.93, + "learning_rate": 2.202306013205684e-05, + "loss": 2.7011, + "step": 102600 + }, + { + "epoch": 0.93, + "learning_rate": 2.1751338230365826e-05, + "loss": 2.6968, + "step": 102700 + }, + { + "epoch": 0.93, + "learning_rate": 2.1479616328674812e-05, + "loss": 2.701, + "step": 102800 + }, + { + "epoch": 0.93, + "learning_rate": 2.1207894426983794e-05, + "loss": 2.7079, + "step": 102900 + }, + { + "epoch": 0.93, + "learning_rate": 2.093617252529278e-05, + "loss": 2.6996, + "step": 103000 + }, + { + "epoch": 0.93, + "eval_accuracy": 0.46891387127116774, + "eval_loss": 2.7129361629486084, + "eval_runtime": 43.7353, + "eval_samples_per_second": 148.233, + "eval_steps_per_second": 2.492, + "step": 103000 + }, + { + "epoch": 0.93, + "learning_rate": 2.0664450623601765e-05, + "loss": 2.6985, + "step": 103100 + }, + { + "epoch": 0.93, + "learning_rate": 2.0392728721910743e-05, + "loss": 2.6945, + "step": 103200 + }, + { + "epoch": 0.93, + "learning_rate": 2.012100682021973e-05, + "loss": 2.6988, + "step": 103300 + }, + { + "epoch": 0.93, + "learning_rate": 1.9849284918528714e-05, + "loss": 2.701, + "step": 103400 + }, + { + "epoch": 0.94, + "learning_rate": 1.9577563016837696e-05, + "loss": 2.7044, + "step": 103500 + }, + { + "epoch": 0.94, + "learning_rate": 1.930584111514668e-05, + "loss": 2.6897, + "step": 103600 + }, + { + "epoch": 0.94, + "learning_rate": 1.9034119213455667e-05, + "loss": 2.6993, + "step": 103700 + }, + { + "epoch": 0.94, + "learning_rate": 1.8762397311764652e-05, + "loss": 2.6978, + "step": 103800 + }, + { + "epoch": 0.94, + "learning_rate": 1.8490675410073634e-05, + "loss": 2.6965, + "step": 103900 + }, + { + "epoch": 0.94, + "learning_rate": 1.821895350838262e-05, + "loss": 2.6898, + "step": 104000 + }, + { + "epoch": 0.94, + "eval_accuracy": 0.46940021111020375, + "eval_loss": 2.7084131240844727, + "eval_runtime": 44.0036, + "eval_samples_per_second": 147.329, + "eval_steps_per_second": 2.477, + "step": 104000 + }, + { + "epoch": 0.94, + "learning_rate": 1.7947231606691602e-05, + "loss": 2.6918, + "step": 104100 + }, + { + "epoch": 0.94, + "learning_rate": 1.7675509705000587e-05, + "loss": 2.6941, + "step": 104200 + }, + { + "epoch": 0.94, + "learning_rate": 1.7403787803309573e-05, + "loss": 2.6954, + "step": 104300 + }, + { + "epoch": 0.94, + "learning_rate": 1.7132065901618555e-05, + "loss": 2.7015, + "step": 104400 + }, + { + "epoch": 0.94, + "learning_rate": 1.686034399992754e-05, + "loss": 2.698, + "step": 104500 + }, + { + "epoch": 0.95, + "learning_rate": 1.6588622098236522e-05, + "loss": 2.6922, + "step": 104600 + }, + { + "epoch": 0.95, + "learning_rate": 1.6319617415562416e-05, + "loss": 2.6932, + "step": 104700 + }, + { + "epoch": 0.95, + "learning_rate": 1.60478955138714e-05, + "loss": 2.6887, + "step": 104800 + }, + { + "epoch": 0.95, + "learning_rate": 1.5776173612180387e-05, + "loss": 2.6887, + "step": 104900 + }, + { + "epoch": 0.95, + "learning_rate": 1.550445171048937e-05, + "loss": 2.688, + "step": 105000 + }, + { + "epoch": 0.95, + "eval_accuracy": 0.4697316964731288, + "eval_loss": 2.705327272415161, + "eval_runtime": 43.7246, + "eval_samples_per_second": 148.269, + "eval_steps_per_second": 2.493, + "step": 105000 + }, + { + "epoch": 0.95, + "learning_rate": 1.5232729808798354e-05, + "loss": 2.6933, + "step": 105100 + }, + { + "epoch": 0.95, + "learning_rate": 1.4961007907107338e-05, + "loss": 2.6992, + "step": 105200 + }, + { + "epoch": 0.95, + "learning_rate": 1.468928600541632e-05, + "loss": 2.6943, + "step": 105300 + }, + { + "epoch": 0.95, + "learning_rate": 1.4417564103725306e-05, + "loss": 2.6919, + "step": 105400 + }, + { + "epoch": 0.95, + "learning_rate": 1.414584220203429e-05, + "loss": 2.6961, + "step": 105500 + }, + { + "epoch": 0.95, + "learning_rate": 1.3874120300343275e-05, + "loss": 2.6942, + "step": 105600 + }, + { + "epoch": 0.96, + "learning_rate": 1.3602398398652258e-05, + "loss": 2.6936, + "step": 105700 + }, + { + "epoch": 0.96, + "learning_rate": 1.3330676496961242e-05, + "loss": 2.6851, + "step": 105800 + }, + { + "epoch": 0.96, + "learning_rate": 1.3058954595270228e-05, + "loss": 2.6929, + "step": 105900 + }, + { + "epoch": 0.96, + "learning_rate": 1.278723269357921e-05, + "loss": 2.6855, + "step": 106000 + }, + { + "epoch": 0.96, + "eval_accuracy": 0.4701273012675686, + "eval_loss": 2.701770305633545, + "eval_runtime": 44.1379, + "eval_samples_per_second": 146.881, + "eval_steps_per_second": 2.47, + "step": 106000 + }, + { + "epoch": 0.96, + "learning_rate": 1.2515510791888195e-05, + "loss": 2.6922, + "step": 106100 + }, + { + "epoch": 0.96, + "learning_rate": 1.2243788890197179e-05, + "loss": 2.6811, + "step": 106200 + }, + { + "epoch": 0.96, + "learning_rate": 1.1972066988506163e-05, + "loss": 2.6819, + "step": 106300 + }, + { + "epoch": 0.96, + "learning_rate": 1.1700345086815148e-05, + "loss": 2.6882, + "step": 106400 + }, + { + "epoch": 0.96, + "learning_rate": 1.142862318512413e-05, + "loss": 2.685, + "step": 106500 + }, + { + "epoch": 0.96, + "learning_rate": 1.1159618502450025e-05, + "loss": 2.6841, + "step": 106600 + }, + { + "epoch": 0.96, + "learning_rate": 1.0887896600759008e-05, + "loss": 2.6806, + "step": 106700 + }, + { + "epoch": 0.97, + "learning_rate": 1.0616174699067993e-05, + "loss": 2.6896, + "step": 106800 + }, + { + "epoch": 0.97, + "learning_rate": 1.0344452797376977e-05, + "loss": 2.6807, + "step": 106900 + }, + { + "epoch": 0.97, + "learning_rate": 1.0072730895685962e-05, + "loss": 2.6852, + "step": 107000 + }, + { + "epoch": 0.97, + "eval_accuracy": 0.4704999198507106, + "eval_loss": 2.698939085006714, + "eval_runtime": 43.9086, + "eval_samples_per_second": 147.648, + "eval_steps_per_second": 2.482, + "step": 107000 + }, + { + "epoch": 0.97, + "learning_rate": 9.803726213011856e-06, + "loss": 2.6861, + "step": 107100 + }, + { + "epoch": 0.97, + "learning_rate": 9.53200431132084e-06, + "loss": 2.6886, + "step": 107200 + }, + { + "epoch": 0.97, + "learning_rate": 9.260282409629823e-06, + "loss": 2.6872, + "step": 107300 + }, + { + "epoch": 0.97, + "learning_rate": 8.988560507938807e-06, + "loss": 2.685, + "step": 107400 + }, + { + "epoch": 0.97, + "learning_rate": 8.71683860624779e-06, + "loss": 2.6892, + "step": 107500 + }, + { + "epoch": 0.97, + "learning_rate": 8.445116704556776e-06, + "loss": 2.6815, + "step": 107600 + }, + { + "epoch": 0.97, + "learning_rate": 8.17339480286576e-06, + "loss": 2.6879, + "step": 107700 + }, + { + "epoch": 0.97, + "learning_rate": 7.901672901174744e-06, + "loss": 2.6822, + "step": 107800 + }, + { + "epoch": 0.98, + "learning_rate": 7.629950999483727e-06, + "loss": 2.6806, + "step": 107900 + }, + { + "epoch": 0.98, + "learning_rate": 7.360946316809621e-06, + "loss": 2.689, + "step": 108000 + }, + { + "epoch": 0.98, + "eval_accuracy": 0.4705204864608191, + "eval_loss": 2.6981818675994873, + "eval_runtime": 43.1633, + "eval_samples_per_second": 150.197, + "eval_steps_per_second": 2.525, + "step": 108000 + }, + { + "epoch": 0.98, + "learning_rate": 7.089224415118606e-06, + "loss": 2.6872, + "step": 108100 + }, + { + "epoch": 0.98, + "learning_rate": 6.81750251342759e-06, + "loss": 2.6962, + "step": 108200 + }, + { + "epoch": 0.98, + "learning_rate": 6.545780611736574e-06, + "loss": 2.6831, + "step": 108300 + }, + { + "epoch": 0.98, + "learning_rate": 6.274058710045559e-06, + "loss": 2.6877, + "step": 108400 + }, + { + "epoch": 0.98, + "learning_rate": 6.0023368083545415e-06, + "loss": 2.6956, + "step": 108500 + }, + { + "epoch": 0.98, + "learning_rate": 5.730614906663526e-06, + "loss": 2.6936, + "step": 108600 + }, + { + "epoch": 0.98, + "learning_rate": 5.458893004972511e-06, + "loss": 2.6864, + "step": 108700 + }, + { + "epoch": 0.98, + "learning_rate": 5.187171103281495e-06, + "loss": 2.6838, + "step": 108800 + }, + { + "epoch": 0.98, + "learning_rate": 4.915449201590478e-06, + "loss": 2.6867, + "step": 108900 + }, + { + "epoch": 0.99, + "learning_rate": 4.643727299899463e-06, + "loss": 2.6868, + "step": 109000 + }, + { + "epoch": 0.99, + "eval_accuracy": 0.4707297819636878, + "eval_loss": 2.6994001865386963, + "eval_runtime": 43.0302, + "eval_samples_per_second": 150.662, + "eval_steps_per_second": 2.533, + "step": 109000 + }, + { + "epoch": 0.99, + "learning_rate": 4.3720053982084465e-06, + "loss": 2.689, + "step": 109100 + }, + { + "epoch": 0.99, + "learning_rate": 4.10028349651743e-06, + "loss": 2.6831, + "step": 109200 + }, + { + "epoch": 0.99, + "learning_rate": 3.831278813843325e-06, + "loss": 2.6825, + "step": 109300 + }, + { + "epoch": 0.99, + "learning_rate": 3.559556912152309e-06, + "loss": 2.6851, + "step": 109400 + }, + { + "epoch": 0.99, + "learning_rate": 3.2878350104612927e-06, + "loss": 2.6798, + "step": 109500 + }, + { + "epoch": 0.99, + "learning_rate": 3.016113108770277e-06, + "loss": 2.6773, + "step": 109600 + }, + { + "epoch": 0.99, + "learning_rate": 2.744391207079261e-06, + "loss": 2.6829, + "step": 109700 + }, + { + "epoch": 0.99, + "learning_rate": 2.472669305388245e-06, + "loss": 2.6819, + "step": 109800 + }, + { + "epoch": 0.99, + "learning_rate": 2.2036646227141394e-06, + "loss": 2.6827, + "step": 109900 + }, + { + "epoch": 0.99, + "learning_rate": 1.931942721023123e-06, + "loss": 2.6901, + "step": 110000 + }, + { + "epoch": 0.99, + "eval_accuracy": 0.47069106834466007, + "eval_loss": 2.700648307800293, + "eval_runtime": 43.0535, + "eval_samples_per_second": 150.58, + "eval_steps_per_second": 2.532, + "step": 110000 + }, + { + "epoch": 1.0, + "learning_rate": 1.6602208193321073e-06, + "loss": 2.6809, + "step": 110100 + }, + { + "epoch": 1.0, + "learning_rate": 1.3884989176410914e-06, + "loss": 2.6866, + "step": 110200 + }, + { + "epoch": 1.0, + "learning_rate": 1.1167770159500756e-06, + "loss": 2.6863, + "step": 110300 + }, + { + "epoch": 1.0, + "learning_rate": 8.450551142590596e-07, + "loss": 2.6912, + "step": 110400 + }, + { + "epoch": 1.0, + "learning_rate": 5.733332125680437e-07, + "loss": 2.6916, + "step": 110500 + }, + { + "epoch": 1.0, + "learning_rate": 3.0161131087702765e-07, + "loss": 2.684, + "step": 110600 + }, + { + "epoch": 1.0, + "step": 110607, + "total_flos": 2.899312376933253e+20, + "train_loss": 2.8584754099769967, + "train_runtime": 318077.2613, + "train_samples_per_second": 83.457, + "train_steps_per_second": 0.348 + } + ], + "logging_steps": 100, + "max_steps": 110607, + "num_train_epochs": 1, + "save_steps": 11061, + "total_flos": 2.899312376933253e+20, + "trial_name": null, + "trial_params": null +}