{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.96398891966759,
  "eval_steps": 500,
  "global_step": 180,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01662049861495845,
      "grad_norm": 1.9652302265167236,
      "learning_rate": 0.0,
      "loss": 0.7152,
      "step": 1
    },
    {
      "epoch": 0.0332409972299169,
      "grad_norm": 2.135629177093506,
      "learning_rate": 5.555555555555555e-07,
      "loss": 0.7024,
      "step": 2
    },
    {
      "epoch": 0.04986149584487535,
      "grad_norm": 2.365844964981079,
      "learning_rate": 1.111111111111111e-06,
      "loss": 0.7755,
      "step": 3
    },
    {
      "epoch": 0.0664819944598338,
      "grad_norm": 1.939900517463684,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.7134,
      "step": 4
    },
    {
      "epoch": 0.08310249307479224,
      "grad_norm": 1.8507870435714722,
      "learning_rate": 2.222222222222222e-06,
      "loss": 0.6644,
      "step": 5
    },
    {
      "epoch": 0.0997229916897507,
      "grad_norm": 1.8390847444534302,
      "learning_rate": 2.7777777777777783e-06,
      "loss": 0.7306,
      "step": 6
    },
    {
      "epoch": 0.11634349030470914,
      "grad_norm": 1.2149966955184937,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.5377,
      "step": 7
    },
    {
      "epoch": 0.1329639889196676,
      "grad_norm": 1.203329086303711,
      "learning_rate": 3.88888888888889e-06,
      "loss": 0.6448,
      "step": 8
    },
    {
      "epoch": 0.14958448753462603,
      "grad_norm": 1.1259090900421143,
      "learning_rate": 4.444444444444444e-06,
      "loss": 0.6041,
      "step": 9
    },
    {
      "epoch": 0.16620498614958448,
      "grad_norm": 0.9785488247871399,
      "learning_rate": 5e-06,
      "loss": 0.6802,
      "step": 10
    },
    {
      "epoch": 0.18282548476454294,
      "grad_norm": 0.7702904343605042,
      "learning_rate": 5.555555555555557e-06,
      "loss": 0.5737,
      "step": 11
    },
    {
      "epoch": 0.1994459833795014,
      "grad_norm": 0.7972448468208313,
      "learning_rate": 6.111111111111112e-06,
      "loss": 0.6071,
      "step": 12
    },
    {
      "epoch": 0.21606648199445982,
      "grad_norm": 0.8643639087677002,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.5645,
      "step": 13
    },
    {
      "epoch": 0.23268698060941828,
      "grad_norm": 0.822340190410614,
      "learning_rate": 7.222222222222223e-06,
      "loss": 0.5512,
      "step": 14
    },
    {
      "epoch": 0.24930747922437674,
      "grad_norm": 1.0604660511016846,
      "learning_rate": 7.77777777777778e-06,
      "loss": 0.5875,
      "step": 15
    },
    {
      "epoch": 0.2659279778393352,
      "grad_norm": 0.8126739263534546,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.5601,
      "step": 16
    },
    {
      "epoch": 0.28254847645429365,
      "grad_norm": 0.7240079641342163,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.5724,
      "step": 17
    },
    {
      "epoch": 0.29916897506925205,
      "grad_norm": 0.6566236615180969,
      "learning_rate": 9.444444444444445e-06,
      "loss": 0.5535,
      "step": 18
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.7229272723197937,
      "learning_rate": 1e-05,
      "loss": 0.5413,
      "step": 19
    },
    {
      "epoch": 0.33240997229916897,
      "grad_norm": 0.6160261034965515,
      "learning_rate": 9.999059852242508e-06,
      "loss": 0.4809,
      "step": 20
    },
    {
      "epoch": 0.3490304709141274,
      "grad_norm": 0.5426657199859619,
      "learning_rate": 9.996239762521152e-06,
      "loss": 0.4453,
      "step": 21
    },
    {
      "epoch": 0.3656509695290859,
      "grad_norm": 0.6986624002456665,
      "learning_rate": 9.991540791356342e-06,
      "loss": 0.5704,
      "step": 22
    },
    {
      "epoch": 0.38227146814404434,
      "grad_norm": 0.6466948986053467,
      "learning_rate": 9.98496470583896e-06,
      "loss": 0.5222,
      "step": 23
    },
    {
      "epoch": 0.3988919667590028,
      "grad_norm": 0.5881003141403198,
      "learning_rate": 9.976513978965829e-06,
      "loss": 0.4903,
      "step": 24
    },
    {
      "epoch": 0.4155124653739612,
      "grad_norm": 0.5835773348808289,
      "learning_rate": 9.966191788709716e-06,
      "loss": 0.4936,
      "step": 25
    },
    {
      "epoch": 0.43213296398891965,
      "grad_norm": 0.5974717736244202,
      "learning_rate": 9.954002016824226e-06,
      "loss": 0.544,
      "step": 26
    },
    {
      "epoch": 0.4487534626038781,
      "grad_norm": 0.6126233339309692,
      "learning_rate": 9.939949247384046e-06,
      "loss": 0.5313,
      "step": 27
    },
    {
      "epoch": 0.46537396121883656,
      "grad_norm": 0.5605891942977905,
      "learning_rate": 9.924038765061042e-06,
      "loss": 0.5121,
      "step": 28
    },
    {
      "epoch": 0.481994459833795,
      "grad_norm": 0.523395299911499,
      "learning_rate": 9.906276553136924e-06,
      "loss": 0.4705,
      "step": 29
    },
    {
      "epoch": 0.4986149584487535,
      "grad_norm": 0.5597982406616211,
      "learning_rate": 9.886669291253178e-06,
      "loss": 0.4951,
      "step": 30
    },
    {
      "epoch": 0.5152354570637119,
      "grad_norm": 0.5273374915122986,
      "learning_rate": 9.86522435289912e-06,
      "loss": 0.4763,
      "step": 31
    },
    {
      "epoch": 0.5318559556786704,
      "grad_norm": 0.5255304574966431,
      "learning_rate": 9.841949802639031e-06,
      "loss": 0.5133,
      "step": 32
    },
    {
      "epoch": 0.5484764542936288,
      "grad_norm": 0.8223831057548523,
      "learning_rate": 9.816854393079402e-06,
      "loss": 0.4865,
      "step": 33
    },
    {
      "epoch": 0.5650969529085873,
      "grad_norm": 0.4619203805923462,
      "learning_rate": 9.789947561577445e-06,
      "loss": 0.4631,
      "step": 34
    },
    {
      "epoch": 0.5817174515235457,
      "grad_norm": 0.4974648654460907,
      "learning_rate": 9.761239426692077e-06,
      "loss": 0.5039,
      "step": 35
    },
    {
      "epoch": 0.5983379501385041,
      "grad_norm": 0.5178198218345642,
      "learning_rate": 9.730740784378755e-06,
      "loss": 0.4618,
      "step": 36
    },
    {
      "epoch": 0.6149584487534626,
      "grad_norm": 0.5592218637466431,
      "learning_rate": 9.698463103929542e-06,
      "loss": 0.4777,
      "step": 37
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.4956098198890686,
      "learning_rate": 9.664418523660004e-06,
      "loss": 0.4925,
      "step": 38
    },
    {
      "epoch": 0.6481994459833795,
      "grad_norm": 0.48805150389671326,
      "learning_rate": 9.628619846344453e-06,
      "loss": 0.4423,
      "step": 39
    },
    {
      "epoch": 0.6648199445983379,
      "grad_norm": 0.5749639868736267,
      "learning_rate": 9.591080534401371e-06,
      "loss": 0.55,
      "step": 40
    },
    {
      "epoch": 0.6814404432132964,
      "grad_norm": 0.7393980622291565,
      "learning_rate": 9.551814704830734e-06,
      "loss": 0.426,
      "step": 41
    },
    {
      "epoch": 0.6980609418282548,
      "grad_norm": 0.5011327862739563,
      "learning_rate": 9.51083712390519e-06,
      "loss": 0.4628,
      "step": 42
    },
    {
      "epoch": 0.7146814404432132,
      "grad_norm": 0.572926938533783,
      "learning_rate": 9.468163201617063e-06,
      "loss": 0.527,
      "step": 43
    },
    {
      "epoch": 0.7313019390581718,
      "grad_norm": 0.5243227481842041,
      "learning_rate": 9.423808985883289e-06,
      "loss": 0.5115,
      "step": 44
    },
    {
      "epoch": 0.7479224376731302,
      "grad_norm": 0.5271593928337097,
      "learning_rate": 9.377791156510456e-06,
      "loss": 0.4921,
      "step": 45
    },
    {
      "epoch": 0.7645429362880887,
      "grad_norm": 0.5143831968307495,
      "learning_rate": 9.330127018922195e-06,
      "loss": 0.4842,
      "step": 46
    },
    {
      "epoch": 0.7811634349030471,
      "grad_norm": 0.5135733485221863,
      "learning_rate": 9.280834497651334e-06,
      "loss": 0.4939,
      "step": 47
    },
    {
      "epoch": 0.7977839335180056,
      "grad_norm": 0.5173041820526123,
      "learning_rate": 9.229932129599206e-06,
      "loss": 0.4819,
      "step": 48
    },
    {
      "epoch": 0.814404432132964,
      "grad_norm": 0.570851743221283,
      "learning_rate": 9.177439057064684e-06,
      "loss": 0.5439,
      "step": 49
    },
    {
      "epoch": 0.8310249307479224,
      "grad_norm": 0.552671492099762,
      "learning_rate": 9.123375020545534e-06,
      "loss": 0.4669,
      "step": 50
    },
    {
      "epoch": 0.8476454293628809,
      "grad_norm": 0.5668032765388489,
      "learning_rate": 9.067760351314838e-06,
      "loss": 0.5138,
      "step": 51
    },
    {
      "epoch": 0.8642659279778393,
      "grad_norm": 0.48532989621162415,
      "learning_rate": 9.01061596377522e-06,
      "loss": 0.4827,
      "step": 52
    },
    {
      "epoch": 0.8808864265927978,
      "grad_norm": 0.4953126311302185,
      "learning_rate": 8.951963347593797e-06,
      "loss": 0.4273,
      "step": 53
    },
    {
      "epoch": 0.8975069252077562,
      "grad_norm": 0.5042351484298706,
      "learning_rate": 8.891824559620801e-06,
      "loss": 0.5311,
      "step": 54
    },
    {
      "epoch": 0.9141274238227147,
      "grad_norm": 0.532244086265564,
      "learning_rate": 8.83022221559489e-06,
      "loss": 0.5364,
      "step": 55
    },
    {
      "epoch": 0.9307479224376731,
      "grad_norm": 0.5507211089134216,
      "learning_rate": 8.767179481638303e-06,
      "loss": 0.5264,
      "step": 56
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 0.5117627382278442,
      "learning_rate": 8.702720065545024e-06,
      "loss": 0.4994,
      "step": 57
    },
    {
      "epoch": 0.96398891966759,
      "grad_norm": 0.6424684524536133,
      "learning_rate": 8.636868207865244e-06,
      "loss": 0.5321,
      "step": 58
    },
    {
      "epoch": 0.9806094182825484,
      "grad_norm": 0.5632804036140442,
      "learning_rate": 8.569648672789496e-06,
      "loss": 0.5354,
      "step": 59
    },
    {
      "epoch": 0.997229916897507,
      "grad_norm": 0.5519580841064453,
      "learning_rate": 8.501086738835843e-06,
      "loss": 0.5502,
      "step": 60
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.5519580841064453,
      "learning_rate": 8.43120818934367e-06,
      "loss": 0.4298,
      "step": 61
    },
    {
      "epoch": 1.0166204986149585,
      "grad_norm": 1.4024403095245361,
      "learning_rate": 8.360039302777614e-06,
      "loss": 0.3848,
      "step": 62
    },
    {
      "epoch": 1.0332409972299168,
      "grad_norm": 0.4745033085346222,
      "learning_rate": 8.28760684284532e-06,
      "loss": 0.4,
      "step": 63
    },
    {
      "epoch": 1.0498614958448753,
      "grad_norm": 0.5079669952392578,
      "learning_rate": 8.213938048432697e-06,
      "loss": 0.3824,
      "step": 64
    },
    {
      "epoch": 1.0664819944598338,
      "grad_norm": 0.49697190523147583,
      "learning_rate": 8.139060623360494e-06,
      "loss": 0.4243,
      "step": 65
    },
    {
      "epoch": 1.0831024930747923,
      "grad_norm": 0.4616394639015198,
      "learning_rate": 8.063002725966014e-06,
      "loss": 0.3888,
      "step": 66
    },
    {
      "epoch": 1.0997229916897506,
      "grad_norm": 0.4260391294956207,
      "learning_rate": 7.985792958513932e-06,
      "loss": 0.3406,
      "step": 67
    },
    {
      "epoch": 1.1163434903047091,
      "grad_norm": 0.47153493762016296,
      "learning_rate": 7.907460356440133e-06,
      "loss": 0.3636,
      "step": 68
    },
    {
      "epoch": 1.1329639889196677,
      "grad_norm": 0.5076174139976501,
      "learning_rate": 7.828034377432694e-06,
      "loss": 0.4166,
      "step": 69
    },
    {
      "epoch": 1.149584487534626,
      "grad_norm": 0.5310080647468567,
      "learning_rate": 7.747544890354031e-06,
      "loss": 0.4311,
      "step": 70
    },
    {
      "epoch": 1.1662049861495845,
      "grad_norm": 0.5010002851486206,
      "learning_rate": 7.666022164008458e-06,
      "loss": 0.3193,
      "step": 71
    },
    {
      "epoch": 1.182825484764543,
      "grad_norm": 0.49259936809539795,
      "learning_rate": 7.5834968557593155e-06,
      "loss": 0.3456,
      "step": 72
    },
    {
      "epoch": 1.1994459833795015,
      "grad_norm": 0.5213885307312012,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.3615,
      "step": 73
    },
    {
      "epoch": 1.2160664819944598,
      "grad_norm": 0.512752115726471,
      "learning_rate": 7.415562996483193e-06,
      "loss": 0.3569,
      "step": 74
    },
    {
      "epoch": 1.2326869806094183,
      "grad_norm": 0.5139035582542419,
      "learning_rate": 7.330217598512696e-06,
      "loss": 0.3859,
      "step": 75
    },
    {
      "epoch": 1.2493074792243768,
      "grad_norm": 0.5561084151268005,
      "learning_rate": 7.243995901002312e-06,
      "loss": 0.363,
      "step": 76
    },
    {
      "epoch": 1.2659279778393353,
      "grad_norm": 0.49844229221343994,
      "learning_rate": 7.156930328406268e-06,
      "loss": 0.3648,
      "step": 77
    },
    {
      "epoch": 1.2825484764542936,
      "grad_norm": 0.5111745595932007,
      "learning_rate": 7.069053622525697e-06,
      "loss": 0.3453,
      "step": 78
    },
    {
      "epoch": 1.299168975069252,
      "grad_norm": 0.5968831777572632,
      "learning_rate": 6.980398830195785e-06,
      "loss": 0.3601,
      "step": 79
    },
    {
      "epoch": 1.3157894736842106,
      "grad_norm": 0.3998188376426697,
      "learning_rate": 6.890999290858213e-06,
      "loss": 0.2965,
      "step": 80
    },
    {
      "epoch": 1.332409972299169,
      "grad_norm": 0.5044348239898682,
      "learning_rate": 6.800888624023552e-06,
      "loss": 0.3579,
      "step": 81
    },
    {
      "epoch": 1.3490304709141274,
      "grad_norm": 0.499636709690094,
      "learning_rate": 6.710100716628345e-06,
      "loss": 0.3751,
      "step": 82
    },
    {
      "epoch": 1.365650969529086,
      "grad_norm": 0.5045871734619141,
      "learning_rate": 6.618669710291607e-06,
      "loss": 0.3782,
      "step": 83
    },
    {
      "epoch": 1.3822714681440442,
      "grad_norm": 0.5296726822853088,
      "learning_rate": 6.526629988475567e-06,
      "loss": 0.413,
      "step": 84
    },
    {
      "epoch": 1.3988919667590027,
      "grad_norm": 0.5541542768478394,
      "learning_rate": 6.434016163555452e-06,
      "loss": 0.4176,
      "step": 85
    },
    {
      "epoch": 1.4155124653739612,
      "grad_norm": 0.52264803647995,
      "learning_rate": 6.340863063803187e-06,
      "loss": 0.3687,
      "step": 86
    },
    {
      "epoch": 1.4321329639889195,
      "grad_norm": 0.5726013779640198,
      "learning_rate": 6.247205720289907e-06,
      "loss": 0.4127,
      "step": 87
    },
    {
      "epoch": 1.448753462603878,
      "grad_norm": 0.5129911303520203,
      "learning_rate": 6.153079353712201e-06,
      "loss": 0.3608,
      "step": 88
    },
    {
      "epoch": 1.4653739612188366,
      "grad_norm": 0.5869404673576355,
      "learning_rate": 6.058519361147055e-06,
      "loss": 0.369,
      "step": 89
    },
    {
      "epoch": 1.481994459833795,
      "grad_norm": 0.4603992998600006,
      "learning_rate": 5.9635613027404495e-06,
      "loss": 0.2792,
      "step": 90
    },
    {
      "epoch": 1.4986149584487536,
      "grad_norm": 0.433829128742218,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.2935,
      "step": 91
    },
    {
      "epoch": 1.5152354570637119,
      "grad_norm": 0.4892548620700836,
      "learning_rate": 5.772593964039203e-06,
      "loss": 0.3591,
      "step": 92
    },
    {
      "epoch": 1.5318559556786704,
      "grad_norm": 0.4414325952529907,
      "learning_rate": 5.6766564987506564e-06,
      "loss": 0.3312,
      "step": 93
    },
    {
      "epoch": 1.548476454293629,
      "grad_norm": 0.5104185938835144,
      "learning_rate": 5.5804645706261515e-06,
      "loss": 0.3524,
      "step": 94
    },
    {
      "epoch": 1.5650969529085872,
      "grad_norm": 0.46491438150405884,
      "learning_rate": 5.484054353515896e-06,
      "loss": 0.3127,
      "step": 95
    },
    {
      "epoch": 1.5817174515235457,
      "grad_norm": 0.5037529468536377,
      "learning_rate": 5.387462103359655e-06,
      "loss": 0.3549,
      "step": 96
    },
    {
      "epoch": 1.5983379501385042,
      "grad_norm": 0.456927090883255,
      "learning_rate": 5.290724144552379e-06,
      "loss": 0.3583,
      "step": 97
    },
    {
      "epoch": 1.6149584487534625,
      "grad_norm": 0.48146891593933105,
      "learning_rate": 5.193876856284085e-06,
      "loss": 0.3485,
      "step": 98
    },
    {
      "epoch": 1.631578947368421,
      "grad_norm": 0.45695117115974426,
      "learning_rate": 5.096956658859122e-06,
      "loss": 0.3325,
      "step": 99
    },
    {
      "epoch": 1.6481994459833795,
      "grad_norm": 0.46289077401161194,
      "learning_rate": 5e-06,
      "loss": 0.3461,
      "step": 100
    },
    {
      "epoch": 1.6648199445983378,
      "grad_norm": 0.5340746641159058,
      "learning_rate": 4.903043341140879e-06,
      "loss": 0.3856,
      "step": 101
    },
    {
      "epoch": 1.6814404432132966,
      "grad_norm": 0.433956503868103,
      "learning_rate": 4.806123143715916e-06,
      "loss": 0.3166,
      "step": 102
    },
    {
      "epoch": 1.6980609418282548,
      "grad_norm": 0.4446304440498352,
      "learning_rate": 4.7092758554476215e-06,
      "loss": 0.3378,
      "step": 103
    },
    {
      "epoch": 1.7146814404432131,
      "grad_norm": 0.5027093291282654,
      "learning_rate": 4.6125378966403465e-06,
      "loss": 0.3915,
      "step": 104
    },
    {
      "epoch": 1.7313019390581719,
      "grad_norm": 0.5546647310256958,
      "learning_rate": 4.515945646484105e-06,
      "loss": 0.3484,
      "step": 105
    },
    {
      "epoch": 1.7479224376731302,
      "grad_norm": 0.49674123525619507,
      "learning_rate": 4.4195354293738484e-06,
      "loss": 0.3501,
      "step": 106
    },
    {
      "epoch": 1.7645429362880887,
      "grad_norm": 0.5134773850440979,
      "learning_rate": 4.323343501249346e-06,
      "loss": 0.3818,
      "step": 107
    },
    {
      "epoch": 1.7811634349030472,
      "grad_norm": 0.5111790299415588,
      "learning_rate": 4.227406035960798e-06,
      "loss": 0.4027,
      "step": 108
    },
    {
      "epoch": 1.7977839335180055,
      "grad_norm": 0.5103554129600525,
      "learning_rate": 4.131759111665349e-06,
      "loss": 0.3295,
      "step": 109
    },
    {
      "epoch": 1.814404432132964,
      "grad_norm": 0.48488280177116394,
      "learning_rate": 4.036438697259551e-06,
      "loss": 0.3339,
      "step": 110
    },
    {
      "epoch": 1.8310249307479225,
      "grad_norm": 0.4840296506881714,
      "learning_rate": 3.941480638852948e-06,
      "loss": 0.3519,
      "step": 111
    },
    {
      "epoch": 1.8476454293628808,
      "grad_norm": 0.4919949471950531,
      "learning_rate": 3.8469206462878e-06,
      "loss": 0.328,
      "step": 112
    },
    {
      "epoch": 1.8642659279778393,
      "grad_norm": 0.5291365385055542,
      "learning_rate": 3.752794279710094e-06,
      "loss": 0.3753,
      "step": 113
    },
    {
      "epoch": 1.8808864265927978,
      "grad_norm": 0.4807715117931366,
      "learning_rate": 3.6591369361968127e-06,
      "loss": 0.393,
      "step": 114
    },
    {
      "epoch": 1.897506925207756,
      "grad_norm": 0.4700012803077698,
      "learning_rate": 3.5659838364445505e-06,
      "loss": 0.3182,
      "step": 115
    },
    {
      "epoch": 1.9141274238227148,
      "grad_norm": 1.0692706108093262,
      "learning_rate": 3.473370011524435e-06,
      "loss": 0.3463,
      "step": 116
    },
    {
      "epoch": 1.9307479224376731,
      "grad_norm": 0.49183958768844604,
      "learning_rate": 3.3813302897083955e-06,
      "loss": 0.3694,
      "step": 117
    },
    {
      "epoch": 1.9473684210526314,
      "grad_norm": 0.5577133893966675,
      "learning_rate": 3.289899283371657e-06,
      "loss": 0.3693,
      "step": 118
    },
    {
      "epoch": 1.9639889196675901,
      "grad_norm": 0.47118237614631653,
      "learning_rate": 3.1991113759764493e-06,
      "loss": 0.3325,
      "step": 119
    },
    {
      "epoch": 1.9806094182825484,
      "grad_norm": 0.44954901933670044,
      "learning_rate": 3.1090007091417884e-06,
      "loss": 0.3497,
      "step": 120
    },
    {
      "epoch": 1.997229916897507,
      "grad_norm": 0.5316449403762817,
      "learning_rate": 3.019601169804216e-06,
      "loss": 0.4239,
      "step": 121
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.5316449403762817,
      "learning_rate": 2.9309463774743047e-06,
      "loss": 0.302,
      "step": 122
    },
    {
      "epoch": 2.0166204986149583,
      "grad_norm": 1.3086326122283936,
      "learning_rate": 2.843069671593734e-06,
      "loss": 0.2255,
      "step": 123
    },
    {
      "epoch": 2.033240997229917,
      "grad_norm": 0.4746488928794861,
      "learning_rate": 2.7560040989976894e-06,
      "loss": 0.2275,
      "step": 124
    },
    {
      "epoch": 2.0498614958448753,
      "grad_norm": 0.4944143295288086,
      "learning_rate": 2.6697824014873076e-06,
      "loss": 0.2648,
      "step": 125
    },
    {
      "epoch": 2.0664819944598336,
      "grad_norm": 0.5195774435997009,
      "learning_rate": 2.5844370035168077e-06,
      "loss": 0.2707,
      "step": 126
    },
    {
      "epoch": 2.0831024930747923,
      "grad_norm": 0.885553240776062,
      "learning_rate": 2.5000000000000015e-06,
      "loss": 0.2764,
      "step": 127
    },
    {
      "epoch": 2.0997229916897506,
      "grad_norm": 0.5028234124183655,
      "learning_rate": 2.4165031442406857e-06,
      "loss": 0.2503,
      "step": 128
    },
    {
      "epoch": 2.1163434903047094,
      "grad_norm": 0.4780957102775574,
      "learning_rate": 2.333977835991545e-06,
      "loss": 0.2406,
      "step": 129
    },
    {
      "epoch": 2.1329639889196677,
      "grad_norm": 0.46052825450897217,
      "learning_rate": 2.2524551096459703e-06,
      "loss": 0.2155,
      "step": 130
    },
    {
      "epoch": 2.149584487534626,
      "grad_norm": 0.6180452704429626,
      "learning_rate": 2.171965622567308e-06,
      "loss": 0.2787,
      "step": 131
    },
    {
      "epoch": 2.1662049861495847,
      "grad_norm": 0.6939100027084351,
      "learning_rate": 2.0925396435598665e-06,
      "loss": 0.246,
      "step": 132
    },
    {
      "epoch": 2.182825484764543,
      "grad_norm": 0.6042692065238953,
      "learning_rate": 2.0142070414860704e-06,
      "loss": 0.2609,
      "step": 133
    },
    {
      "epoch": 2.1994459833795013,
      "grad_norm": 0.7851183414459229,
      "learning_rate": 1.936997274033986e-06,
      "loss": 0.2876,
      "step": 134
    },
    {
      "epoch": 2.21606648199446,
      "grad_norm": 0.5801565051078796,
      "learning_rate": 1.8609393766395083e-06,
      "loss": 0.288,
      "step": 135
    },
    {
      "epoch": 2.2326869806094183,
      "grad_norm": 0.5398533940315247,
      "learning_rate": 1.7860619515673034e-06,
      "loss": 0.2958,
      "step": 136
    },
    {
      "epoch": 2.2493074792243766,
      "grad_norm": 0.48142921924591064,
      "learning_rate": 1.7123931571546826e-06,
      "loss": 0.2506,
      "step": 137
    },
    {
      "epoch": 2.2659279778393353,
      "grad_norm": 0.48484477400779724,
      "learning_rate": 1.639960697222388e-06,
      "loss": 0.2166,
      "step": 138
    },
    {
      "epoch": 2.2825484764542936,
      "grad_norm": 0.4676513075828552,
      "learning_rate": 1.5687918106563326e-06,
      "loss": 0.2558,
      "step": 139
    },
    {
      "epoch": 2.299168975069252,
      "grad_norm": 0.5008206963539124,
      "learning_rate": 1.4989132611641576e-06,
      "loss": 0.2315,
      "step": 140
    },
    {
      "epoch": 2.3157894736842106,
      "grad_norm": 0.5055615901947021,
      "learning_rate": 1.4303513272105057e-06,
      "loss": 0.278,
      "step": 141
    },
    {
      "epoch": 2.332409972299169,
      "grad_norm": 0.5048314332962036,
      "learning_rate": 1.3631317921347564e-06,
      "loss": 0.2469,
      "step": 142
    },
    {
      "epoch": 2.349030470914127,
      "grad_norm": 0.4561052620410919,
      "learning_rate": 1.297279934454978e-06,
      "loss": 0.2363,
      "step": 143
    },
    {
      "epoch": 2.365650969529086,
      "grad_norm": 0.4409971237182617,
      "learning_rate": 1.2328205183616964e-06,
      "loss": 0.2582,
      "step": 144
    },
    {
      "epoch": 2.3822714681440442,
      "grad_norm": 0.5186073780059814,
      "learning_rate": 1.1697777844051105e-06,
      "loss": 0.2354,
      "step": 145
    },
    {
      "epoch": 2.398891966759003,
      "grad_norm": 0.4931983947753906,
      "learning_rate": 1.1081754403792e-06,
      "loss": 0.2628,
      "step": 146
    },
    {
      "epoch": 2.4155124653739612,
      "grad_norm": 0.4725812077522278,
      "learning_rate": 1.0480366524062041e-06,
      "loss": 0.2465,
      "step": 147
    },
    {
      "epoch": 2.4321329639889195,
      "grad_norm": 0.459830641746521,
      "learning_rate": 9.893840362247809e-07,
      "loss": 0.2494,
      "step": 148
    },
    {
      "epoch": 2.4487534626038783,
      "grad_norm": 0.45882484316825867,
      "learning_rate": 9.322396486851626e-07,
      "loss": 0.2572,
      "step": 149
    },
    {
      "epoch": 2.4653739612188366,
      "grad_norm": 0.4628044664859772,
      "learning_rate": 8.766249794544662e-07,
      "loss": 0.2473,
      "step": 150
    },
    {
      "epoch": 2.481994459833795,
      "grad_norm": 0.43482884764671326,
      "learning_rate": 8.225609429353187e-07,
      "loss": 0.2334,
      "step": 151
    },
    {
      "epoch": 2.4986149584487536,
      "grad_norm": 0.5092786550521851,
      "learning_rate": 7.700678704007947e-07,
      "loss": 0.2464,
      "step": 152
    },
    {
      "epoch": 2.515235457063712,
      "grad_norm": 0.5002970695495605,
      "learning_rate": 7.191655023486682e-07,
      "loss": 0.2386,
      "step": 153
    },
    {
      "epoch": 2.5318559556786706,
      "grad_norm": 0.44085896015167236,
      "learning_rate": 6.698729810778065e-07,
      "loss": 0.2231,
      "step": 154
    },
    {
      "epoch": 2.548476454293629,
      "grad_norm": 0.4750898480415344,
      "learning_rate": 6.222088434895462e-07,
      "loss": 0.2746,
      "step": 155
    },
    {
      "epoch": 2.565096952908587,
      "grad_norm": 0.5058760643005371,
      "learning_rate": 5.76191014116711e-07,
      "loss": 0.2753,
      "step": 156
    },
    {
      "epoch": 2.581717451523546,
      "grad_norm": 0.4807314872741699,
      "learning_rate": 5.318367983829393e-07,
      "loss": 0.2295,
      "step": 157
    },
    {
      "epoch": 2.598337950138504,
      "grad_norm": 0.4975450336933136,
      "learning_rate": 4.891628760948114e-07,
      "loss": 0.2623,
      "step": 158
    },
    {
      "epoch": 2.6149584487534625,
      "grad_norm": 0.44517505168914795,
      "learning_rate": 4.481852951692672e-07,
      "loss": 0.2505,
      "step": 159
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 0.526871919631958,
      "learning_rate": 4.089194655986306e-07,
      "loss": 0.2944,
      "step": 160
    },
    {
      "epoch": 2.6481994459833795,
      "grad_norm": 0.5860976576805115,
      "learning_rate": 3.7138015365554834e-07,
      "loss": 0.2929,
      "step": 161
    },
    {
      "epoch": 2.664819944598338,
      "grad_norm": 0.5570012927055359,
      "learning_rate": 3.355814763399973e-07,
      "loss": 0.2669,
      "step": 162
    },
    {
      "epoch": 2.6814404432132966,
      "grad_norm": 0.46305856108665466,
      "learning_rate": 3.015368960704584e-07,
      "loss": 0.2464,
      "step": 163
    },
    {
      "epoch": 2.698060941828255,
      "grad_norm": 0.49931517243385315,
      "learning_rate": 2.6925921562124867e-07,
      "loss": 0.233,
      "step": 164
    },
    {
      "epoch": 2.714681440443213,
      "grad_norm": 0.4253719449043274,
      "learning_rate": 2.3876057330792344e-07,
      "loss": 0.2115,
      "step": 165
    },
    {
      "epoch": 2.731301939058172,
      "grad_norm": 0.46956562995910645,
      "learning_rate": 2.1005243842255552e-07,
      "loss": 0.2419,
      "step": 166
    },
    {
      "epoch": 2.74792243767313,
      "grad_norm": 0.47405821084976196,
      "learning_rate": 1.8314560692059836e-07,
      "loss": 0.2442,
      "step": 167
    },
    {
      "epoch": 2.7645429362880884,
      "grad_norm": 0.5373594164848328,
      "learning_rate": 1.5805019736097105e-07,
      "loss": 0.304,
      "step": 168
    },
    {
      "epoch": 2.781163434903047,
      "grad_norm": 0.49911409616470337,
      "learning_rate": 1.3477564710088097e-07,
      "loss": 0.2604,
      "step": 169
    },
    {
      "epoch": 2.7977839335180055,
      "grad_norm": 0.524211585521698,
      "learning_rate": 1.1333070874682217e-07,
      "loss": 0.2319,
      "step": 170
    },
    {
      "epoch": 2.8144044321329638,
      "grad_norm": 0.49799832701683044,
      "learning_rate": 9.372344686307655e-08,
      "loss": 0.2648,
      "step": 171
    },
    {
      "epoch": 2.8310249307479225,
      "grad_norm": 0.4979800581932068,
      "learning_rate": 7.59612349389599e-08,
      "loss": 0.2671,
      "step": 172
    },
    {
      "epoch": 2.847645429362881,
      "grad_norm": 0.5030661225318909,
      "learning_rate": 6.005075261595495e-08,
      "loss": 0.2219,
      "step": 173
    },
    {
      "epoch": 2.864265927977839,
      "grad_norm": 0.4839530885219574,
      "learning_rate": 4.599798317577342e-08,
      "loss": 0.2981,
      "step": 174
    },
    {
      "epoch": 2.880886426592798,
      "grad_norm": 0.49113729596138,
      "learning_rate": 3.3808211290284886e-08,
      "loss": 0.2574,
      "step": 175
    },
    {
      "epoch": 2.897506925207756,
      "grad_norm": 0.5154249668121338,
      "learning_rate": 2.3486021034170857e-08,
      "loss": 0.2584,
      "step": 176
    },
    {
      "epoch": 2.914127423822715,
      "grad_norm": 0.46952885389328003,
      "learning_rate": 1.5035294161039882e-08,
      "loss": 0.2785,
      "step": 177
    },
    {
      "epoch": 2.930747922437673,
      "grad_norm": 0.49860695004463196,
      "learning_rate": 8.459208643659122e-09,
      "loss": 0.2572,
      "step": 178
    },
    {
      "epoch": 2.9473684210526314,
      "grad_norm": 0.5341483354568481,
      "learning_rate": 3.760237478849793e-09,
      "loss": 0.2964,
      "step": 179
    },
    {
      "epoch": 2.96398891966759,
      "grad_norm": 0.5575993061065674,
      "learning_rate": 9.401477574932927e-10,
      "loss": 0.2896,
      "step": 180
    },
    {
      "epoch": 2.96398891966759,
      "step": 180,
      "total_flos": 6.743893969836442e+16,
      "train_loss": 0.3866574793226189,
      "train_runtime": 24143.75,
      "train_samples_per_second": 0.179,
      "train_steps_per_second": 0.007
    }
  ],
  "logging_steps": 1,
  "max_steps": 180,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.743893969836442e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}