{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.96398891966759,
"eval_steps": 500,
"global_step": 180,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01662049861495845,
"grad_norm": 1.9652302265167236,
"learning_rate": 0.0,
"loss": 0.7152,
"step": 1
},
{
"epoch": 0.0332409972299169,
"grad_norm": 2.135629177093506,
"learning_rate": 5.555555555555555e-07,
"loss": 0.7024,
"step": 2
},
{
"epoch": 0.04986149584487535,
"grad_norm": 2.365844964981079,
"learning_rate": 1.111111111111111e-06,
"loss": 0.7755,
"step": 3
},
{
"epoch": 0.0664819944598338,
"grad_norm": 1.939900517463684,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.7134,
"step": 4
},
{
"epoch": 0.08310249307479224,
"grad_norm": 1.8507870435714722,
"learning_rate": 2.222222222222222e-06,
"loss": 0.6644,
"step": 5
},
{
"epoch": 0.0997229916897507,
"grad_norm": 1.8390847444534302,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.7306,
"step": 6
},
{
"epoch": 0.11634349030470914,
"grad_norm": 1.2149966955184937,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.5377,
"step": 7
},
{
"epoch": 0.1329639889196676,
"grad_norm": 1.203329086303711,
"learning_rate": 3.88888888888889e-06,
"loss": 0.6448,
"step": 8
},
{
"epoch": 0.14958448753462603,
"grad_norm": 1.1259090900421143,
"learning_rate": 4.444444444444444e-06,
"loss": 0.6041,
"step": 9
},
{
"epoch": 0.16620498614958448,
"grad_norm": 0.9785488247871399,
"learning_rate": 5e-06,
"loss": 0.6802,
"step": 10
},
{
"epoch": 0.18282548476454294,
"grad_norm": 0.7702904343605042,
"learning_rate": 5.555555555555557e-06,
"loss": 0.5737,
"step": 11
},
{
"epoch": 0.1994459833795014,
"grad_norm": 0.7972448468208313,
"learning_rate": 6.111111111111112e-06,
"loss": 0.6071,
"step": 12
},
{
"epoch": 0.21606648199445982,
"grad_norm": 0.8643639087677002,
"learning_rate": 6.666666666666667e-06,
"loss": 0.5645,
"step": 13
},
{
"epoch": 0.23268698060941828,
"grad_norm": 0.822340190410614,
"learning_rate": 7.222222222222223e-06,
"loss": 0.5512,
"step": 14
},
{
"epoch": 0.24930747922437674,
"grad_norm": 1.0604660511016846,
"learning_rate": 7.77777777777778e-06,
"loss": 0.5875,
"step": 15
},
{
"epoch": 0.2659279778393352,
"grad_norm": 0.8126739263534546,
"learning_rate": 8.333333333333334e-06,
"loss": 0.5601,
"step": 16
},
{
"epoch": 0.28254847645429365,
"grad_norm": 0.7240079641342163,
"learning_rate": 8.888888888888888e-06,
"loss": 0.5724,
"step": 17
},
{
"epoch": 0.29916897506925205,
"grad_norm": 0.6566236615180969,
"learning_rate": 9.444444444444445e-06,
"loss": 0.5535,
"step": 18
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.7229272723197937,
"learning_rate": 1e-05,
"loss": 0.5413,
"step": 19
},
{
"epoch": 0.33240997229916897,
"grad_norm": 0.6160261034965515,
"learning_rate": 9.999059852242508e-06,
"loss": 0.4809,
"step": 20
},
{
"epoch": 0.3490304709141274,
"grad_norm": 0.5426657199859619,
"learning_rate": 9.996239762521152e-06,
"loss": 0.4453,
"step": 21
},
{
"epoch": 0.3656509695290859,
"grad_norm": 0.6986624002456665,
"learning_rate": 9.991540791356342e-06,
"loss": 0.5704,
"step": 22
},
{
"epoch": 0.38227146814404434,
"grad_norm": 0.6466948986053467,
"learning_rate": 9.98496470583896e-06,
"loss": 0.5222,
"step": 23
},
{
"epoch": 0.3988919667590028,
"grad_norm": 0.5881003141403198,
"learning_rate": 9.976513978965829e-06,
"loss": 0.4903,
"step": 24
},
{
"epoch": 0.4155124653739612,
"grad_norm": 0.5835773348808289,
"learning_rate": 9.966191788709716e-06,
"loss": 0.4936,
"step": 25
},
{
"epoch": 0.43213296398891965,
"grad_norm": 0.5974717736244202,
"learning_rate": 9.954002016824226e-06,
"loss": 0.544,
"step": 26
},
{
"epoch": 0.4487534626038781,
"grad_norm": 0.6126233339309692,
"learning_rate": 9.939949247384046e-06,
"loss": 0.5313,
"step": 27
},
{
"epoch": 0.46537396121883656,
"grad_norm": 0.5605891942977905,
"learning_rate": 9.924038765061042e-06,
"loss": 0.5121,
"step": 28
},
{
"epoch": 0.481994459833795,
"grad_norm": 0.523395299911499,
"learning_rate": 9.906276553136924e-06,
"loss": 0.4705,
"step": 29
},
{
"epoch": 0.4986149584487535,
"grad_norm": 0.5597982406616211,
"learning_rate": 9.886669291253178e-06,
"loss": 0.4951,
"step": 30
},
{
"epoch": 0.5152354570637119,
"grad_norm": 0.5273374915122986,
"learning_rate": 9.86522435289912e-06,
"loss": 0.4763,
"step": 31
},
{
"epoch": 0.5318559556786704,
"grad_norm": 0.5255304574966431,
"learning_rate": 9.841949802639031e-06,
"loss": 0.5133,
"step": 32
},
{
"epoch": 0.5484764542936288,
"grad_norm": 0.8223831057548523,
"learning_rate": 9.816854393079402e-06,
"loss": 0.4865,
"step": 33
},
{
"epoch": 0.5650969529085873,
"grad_norm": 0.4619203805923462,
"learning_rate": 9.789947561577445e-06,
"loss": 0.4631,
"step": 34
},
{
"epoch": 0.5817174515235457,
"grad_norm": 0.4974648654460907,
"learning_rate": 9.761239426692077e-06,
"loss": 0.5039,
"step": 35
},
{
"epoch": 0.5983379501385041,
"grad_norm": 0.5178198218345642,
"learning_rate": 9.730740784378755e-06,
"loss": 0.4618,
"step": 36
},
{
"epoch": 0.6149584487534626,
"grad_norm": 0.5592218637466431,
"learning_rate": 9.698463103929542e-06,
"loss": 0.4777,
"step": 37
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.4956098198890686,
"learning_rate": 9.664418523660004e-06,
"loss": 0.4925,
"step": 38
},
{
"epoch": 0.6481994459833795,
"grad_norm": 0.48805150389671326,
"learning_rate": 9.628619846344453e-06,
"loss": 0.4423,
"step": 39
},
{
"epoch": 0.6648199445983379,
"grad_norm": 0.5749639868736267,
"learning_rate": 9.591080534401371e-06,
"loss": 0.55,
"step": 40
},
{
"epoch": 0.6814404432132964,
"grad_norm": 0.7393980622291565,
"learning_rate": 9.551814704830734e-06,
"loss": 0.426,
"step": 41
},
{
"epoch": 0.6980609418282548,
"grad_norm": 0.5011327862739563,
"learning_rate": 9.51083712390519e-06,
"loss": 0.4628,
"step": 42
},
{
"epoch": 0.7146814404432132,
"grad_norm": 0.572926938533783,
"learning_rate": 9.468163201617063e-06,
"loss": 0.527,
"step": 43
},
{
"epoch": 0.7313019390581718,
"grad_norm": 0.5243227481842041,
"learning_rate": 9.423808985883289e-06,
"loss": 0.5115,
"step": 44
},
{
"epoch": 0.7479224376731302,
"grad_norm": 0.5271593928337097,
"learning_rate": 9.377791156510456e-06,
"loss": 0.4921,
"step": 45
},
{
"epoch": 0.7645429362880887,
"grad_norm": 0.5143831968307495,
"learning_rate": 9.330127018922195e-06,
"loss": 0.4842,
"step": 46
},
{
"epoch": 0.7811634349030471,
"grad_norm": 0.5135733485221863,
"learning_rate": 9.280834497651334e-06,
"loss": 0.4939,
"step": 47
},
{
"epoch": 0.7977839335180056,
"grad_norm": 0.5173041820526123,
"learning_rate": 9.229932129599206e-06,
"loss": 0.4819,
"step": 48
},
{
"epoch": 0.814404432132964,
"grad_norm": 0.570851743221283,
"learning_rate": 9.177439057064684e-06,
"loss": 0.5439,
"step": 49
},
{
"epoch": 0.8310249307479224,
"grad_norm": 0.552671492099762,
"learning_rate": 9.123375020545534e-06,
"loss": 0.4669,
"step": 50
},
{
"epoch": 0.8476454293628809,
"grad_norm": 0.5668032765388489,
"learning_rate": 9.067760351314838e-06,
"loss": 0.5138,
"step": 51
},
{
"epoch": 0.8642659279778393,
"grad_norm": 0.48532989621162415,
"learning_rate": 9.01061596377522e-06,
"loss": 0.4827,
"step": 52
},
{
"epoch": 0.8808864265927978,
"grad_norm": 0.4953126311302185,
"learning_rate": 8.951963347593797e-06,
"loss": 0.4273,
"step": 53
},
{
"epoch": 0.8975069252077562,
"grad_norm": 0.5042351484298706,
"learning_rate": 8.891824559620801e-06,
"loss": 0.5311,
"step": 54
},
{
"epoch": 0.9141274238227147,
"grad_norm": 0.532244086265564,
"learning_rate": 8.83022221559489e-06,
"loss": 0.5364,
"step": 55
},
{
"epoch": 0.9307479224376731,
"grad_norm": 0.5507211089134216,
"learning_rate": 8.767179481638303e-06,
"loss": 0.5264,
"step": 56
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.5117627382278442,
"learning_rate": 8.702720065545024e-06,
"loss": 0.4994,
"step": 57
},
{
"epoch": 0.96398891966759,
"grad_norm": 0.6424684524536133,
"learning_rate": 8.636868207865244e-06,
"loss": 0.5321,
"step": 58
},
{
"epoch": 0.9806094182825484,
"grad_norm": 0.5632804036140442,
"learning_rate": 8.569648672789496e-06,
"loss": 0.5354,
"step": 59
},
{
"epoch": 0.997229916897507,
"grad_norm": 0.5519580841064453,
"learning_rate": 8.501086738835843e-06,
"loss": 0.5502,
"step": 60
},
{
"epoch": 1.0,
"grad_norm": 0.5519580841064453,
"learning_rate": 8.43120818934367e-06,
"loss": 0.4298,
"step": 61
},
{
"epoch": 1.0166204986149585,
"grad_norm": 1.4024403095245361,
"learning_rate": 8.360039302777614e-06,
"loss": 0.3848,
"step": 62
},
{
"epoch": 1.0332409972299168,
"grad_norm": 0.4745033085346222,
"learning_rate": 8.28760684284532e-06,
"loss": 0.4,
"step": 63
},
{
"epoch": 1.0498614958448753,
"grad_norm": 0.5079669952392578,
"learning_rate": 8.213938048432697e-06,
"loss": 0.3824,
"step": 64
},
{
"epoch": 1.0664819944598338,
"grad_norm": 0.49697190523147583,
"learning_rate": 8.139060623360494e-06,
"loss": 0.4243,
"step": 65
},
{
"epoch": 1.0831024930747923,
"grad_norm": 0.4616394639015198,
"learning_rate": 8.063002725966014e-06,
"loss": 0.3888,
"step": 66
},
{
"epoch": 1.0997229916897506,
"grad_norm": 0.4260391294956207,
"learning_rate": 7.985792958513932e-06,
"loss": 0.3406,
"step": 67
},
{
"epoch": 1.1163434903047091,
"grad_norm": 0.47153493762016296,
"learning_rate": 7.907460356440133e-06,
"loss": 0.3636,
"step": 68
},
{
"epoch": 1.1329639889196677,
"grad_norm": 0.5076174139976501,
"learning_rate": 7.828034377432694e-06,
"loss": 0.4166,
"step": 69
},
{
"epoch": 1.149584487534626,
"grad_norm": 0.5310080647468567,
"learning_rate": 7.747544890354031e-06,
"loss": 0.4311,
"step": 70
},
{
"epoch": 1.1662049861495845,
"grad_norm": 0.5010002851486206,
"learning_rate": 7.666022164008458e-06,
"loss": 0.3193,
"step": 71
},
{
"epoch": 1.182825484764543,
"grad_norm": 0.49259936809539795,
"learning_rate": 7.5834968557593155e-06,
"loss": 0.3456,
"step": 72
},
{
"epoch": 1.1994459833795015,
"grad_norm": 0.5213885307312012,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3615,
"step": 73
},
{
"epoch": 1.2160664819944598,
"grad_norm": 0.512752115726471,
"learning_rate": 7.415562996483193e-06,
"loss": 0.3569,
"step": 74
},
{
"epoch": 1.2326869806094183,
"grad_norm": 0.5139035582542419,
"learning_rate": 7.330217598512696e-06,
"loss": 0.3859,
"step": 75
},
{
"epoch": 1.2493074792243768,
"grad_norm": 0.5561084151268005,
"learning_rate": 7.243995901002312e-06,
"loss": 0.363,
"step": 76
},
{
"epoch": 1.2659279778393353,
"grad_norm": 0.49844229221343994,
"learning_rate": 7.156930328406268e-06,
"loss": 0.3648,
"step": 77
},
{
"epoch": 1.2825484764542936,
"grad_norm": 0.5111745595932007,
"learning_rate": 7.069053622525697e-06,
"loss": 0.3453,
"step": 78
},
{
"epoch": 1.299168975069252,
"grad_norm": 0.5968831777572632,
"learning_rate": 6.980398830195785e-06,
"loss": 0.3601,
"step": 79
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.3998188376426697,
"learning_rate": 6.890999290858213e-06,
"loss": 0.2965,
"step": 80
},
{
"epoch": 1.332409972299169,
"grad_norm": 0.5044348239898682,
"learning_rate": 6.800888624023552e-06,
"loss": 0.3579,
"step": 81
},
{
"epoch": 1.3490304709141274,
"grad_norm": 0.499636709690094,
"learning_rate": 6.710100716628345e-06,
"loss": 0.3751,
"step": 82
},
{
"epoch": 1.365650969529086,
"grad_norm": 0.5045871734619141,
"learning_rate": 6.618669710291607e-06,
"loss": 0.3782,
"step": 83
},
{
"epoch": 1.3822714681440442,
"grad_norm": 0.5296726822853088,
"learning_rate": 6.526629988475567e-06,
"loss": 0.413,
"step": 84
},
{
"epoch": 1.3988919667590027,
"grad_norm": 0.5541542768478394,
"learning_rate": 6.434016163555452e-06,
"loss": 0.4176,
"step": 85
},
{
"epoch": 1.4155124653739612,
"grad_norm": 0.52264803647995,
"learning_rate": 6.340863063803187e-06,
"loss": 0.3687,
"step": 86
},
{
"epoch": 1.4321329639889195,
"grad_norm": 0.5726013779640198,
"learning_rate": 6.247205720289907e-06,
"loss": 0.4127,
"step": 87
},
{
"epoch": 1.448753462603878,
"grad_norm": 0.5129911303520203,
"learning_rate": 6.153079353712201e-06,
"loss": 0.3608,
"step": 88
},
{
"epoch": 1.4653739612188366,
"grad_norm": 0.5869404673576355,
"learning_rate": 6.058519361147055e-06,
"loss": 0.369,
"step": 89
},
{
"epoch": 1.481994459833795,
"grad_norm": 0.4603992998600006,
"learning_rate": 5.9635613027404495e-06,
"loss": 0.2792,
"step": 90
},
{
"epoch": 1.4986149584487536,
"grad_norm": 0.433829128742218,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.2935,
"step": 91
},
{
"epoch": 1.5152354570637119,
"grad_norm": 0.4892548620700836,
"learning_rate": 5.772593964039203e-06,
"loss": 0.3591,
"step": 92
},
{
"epoch": 1.5318559556786704,
"grad_norm": 0.4414325952529907,
"learning_rate": 5.6766564987506564e-06,
"loss": 0.3312,
"step": 93
},
{
"epoch": 1.548476454293629,
"grad_norm": 0.5104185938835144,
"learning_rate": 5.5804645706261515e-06,
"loss": 0.3524,
"step": 94
},
{
"epoch": 1.5650969529085872,
"grad_norm": 0.46491438150405884,
"learning_rate": 5.484054353515896e-06,
"loss": 0.3127,
"step": 95
},
{
"epoch": 1.5817174515235457,
"grad_norm": 0.5037529468536377,
"learning_rate": 5.387462103359655e-06,
"loss": 0.3549,
"step": 96
},
{
"epoch": 1.5983379501385042,
"grad_norm": 0.456927090883255,
"learning_rate": 5.290724144552379e-06,
"loss": 0.3583,
"step": 97
},
{
"epoch": 1.6149584487534625,
"grad_norm": 0.48146891593933105,
"learning_rate": 5.193876856284085e-06,
"loss": 0.3485,
"step": 98
},
{
"epoch": 1.631578947368421,
"grad_norm": 0.45695117115974426,
"learning_rate": 5.096956658859122e-06,
"loss": 0.3325,
"step": 99
},
{
"epoch": 1.6481994459833795,
"grad_norm": 0.46289077401161194,
"learning_rate": 5e-06,
"loss": 0.3461,
"step": 100
},
{
"epoch": 1.6648199445983378,
"grad_norm": 0.5340746641159058,
"learning_rate": 4.903043341140879e-06,
"loss": 0.3856,
"step": 101
},
{
"epoch": 1.6814404432132966,
"grad_norm": 0.433956503868103,
"learning_rate": 4.806123143715916e-06,
"loss": 0.3166,
"step": 102
},
{
"epoch": 1.6980609418282548,
"grad_norm": 0.4446304440498352,
"learning_rate": 4.7092758554476215e-06,
"loss": 0.3378,
"step": 103
},
{
"epoch": 1.7146814404432131,
"grad_norm": 0.5027093291282654,
"learning_rate": 4.6125378966403465e-06,
"loss": 0.3915,
"step": 104
},
{
"epoch": 1.7313019390581719,
"grad_norm": 0.5546647310256958,
"learning_rate": 4.515945646484105e-06,
"loss": 0.3484,
"step": 105
},
{
"epoch": 1.7479224376731302,
"grad_norm": 0.49674123525619507,
"learning_rate": 4.4195354293738484e-06,
"loss": 0.3501,
"step": 106
},
{
"epoch": 1.7645429362880887,
"grad_norm": 0.5134773850440979,
"learning_rate": 4.323343501249346e-06,
"loss": 0.3818,
"step": 107
},
{
"epoch": 1.7811634349030472,
"grad_norm": 0.5111790299415588,
"learning_rate": 4.227406035960798e-06,
"loss": 0.4027,
"step": 108
},
{
"epoch": 1.7977839335180055,
"grad_norm": 0.5103554129600525,
"learning_rate": 4.131759111665349e-06,
"loss": 0.3295,
"step": 109
},
{
"epoch": 1.814404432132964,
"grad_norm": 0.48488280177116394,
"learning_rate": 4.036438697259551e-06,
"loss": 0.3339,
"step": 110
},
{
"epoch": 1.8310249307479225,
"grad_norm": 0.4840296506881714,
"learning_rate": 3.941480638852948e-06,
"loss": 0.3519,
"step": 111
},
{
"epoch": 1.8476454293628808,
"grad_norm": 0.4919949471950531,
"learning_rate": 3.8469206462878e-06,
"loss": 0.328,
"step": 112
},
{
"epoch": 1.8642659279778393,
"grad_norm": 0.5291365385055542,
"learning_rate": 3.752794279710094e-06,
"loss": 0.3753,
"step": 113
},
{
"epoch": 1.8808864265927978,
"grad_norm": 0.4807715117931366,
"learning_rate": 3.6591369361968127e-06,
"loss": 0.393,
"step": 114
},
{
"epoch": 1.897506925207756,
"grad_norm": 0.4700012803077698,
"learning_rate": 3.5659838364445505e-06,
"loss": 0.3182,
"step": 115
},
{
"epoch": 1.9141274238227148,
"grad_norm": 1.0692706108093262,
"learning_rate": 3.473370011524435e-06,
"loss": 0.3463,
"step": 116
},
{
"epoch": 1.9307479224376731,
"grad_norm": 0.49183958768844604,
"learning_rate": 3.3813302897083955e-06,
"loss": 0.3694,
"step": 117
},
{
"epoch": 1.9473684210526314,
"grad_norm": 0.5577133893966675,
"learning_rate": 3.289899283371657e-06,
"loss": 0.3693,
"step": 118
},
{
"epoch": 1.9639889196675901,
"grad_norm": 0.47118237614631653,
"learning_rate": 3.1991113759764493e-06,
"loss": 0.3325,
"step": 119
},
{
"epoch": 1.9806094182825484,
"grad_norm": 0.44954901933670044,
"learning_rate": 3.1090007091417884e-06,
"loss": 0.3497,
"step": 120
},
{
"epoch": 1.997229916897507,
"grad_norm": 0.5316449403762817,
"learning_rate": 3.019601169804216e-06,
"loss": 0.4239,
"step": 121
},
{
"epoch": 2.0,
"grad_norm": 0.5316449403762817,
"learning_rate": 2.9309463774743047e-06,
"loss": 0.302,
"step": 122
},
{
"epoch": 2.0166204986149583,
"grad_norm": 1.3086326122283936,
"learning_rate": 2.843069671593734e-06,
"loss": 0.2255,
"step": 123
},
{
"epoch": 2.033240997229917,
"grad_norm": 0.4746488928794861,
"learning_rate": 2.7560040989976894e-06,
"loss": 0.2275,
"step": 124
},
{
"epoch": 2.0498614958448753,
"grad_norm": 0.4944143295288086,
"learning_rate": 2.6697824014873076e-06,
"loss": 0.2648,
"step": 125
},
{
"epoch": 2.0664819944598336,
"grad_norm": 0.5195774435997009,
"learning_rate": 2.5844370035168077e-06,
"loss": 0.2707,
"step": 126
},
{
"epoch": 2.0831024930747923,
"grad_norm": 0.885553240776062,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.2764,
"step": 127
},
{
"epoch": 2.0997229916897506,
"grad_norm": 0.5028234124183655,
"learning_rate": 2.4165031442406857e-06,
"loss": 0.2503,
"step": 128
},
{
"epoch": 2.1163434903047094,
"grad_norm": 0.4780957102775574,
"learning_rate": 2.333977835991545e-06,
"loss": 0.2406,
"step": 129
},
{
"epoch": 2.1329639889196677,
"grad_norm": 0.46052825450897217,
"learning_rate": 2.2524551096459703e-06,
"loss": 0.2155,
"step": 130
},
{
"epoch": 2.149584487534626,
"grad_norm": 0.6180452704429626,
"learning_rate": 2.171965622567308e-06,
"loss": 0.2787,
"step": 131
},
{
"epoch": 2.1662049861495847,
"grad_norm": 0.6939100027084351,
"learning_rate": 2.0925396435598665e-06,
"loss": 0.246,
"step": 132
},
{
"epoch": 2.182825484764543,
"grad_norm": 0.6042692065238953,
"learning_rate": 2.0142070414860704e-06,
"loss": 0.2609,
"step": 133
},
{
"epoch": 2.1994459833795013,
"grad_norm": 0.7851183414459229,
"learning_rate": 1.936997274033986e-06,
"loss": 0.2876,
"step": 134
},
{
"epoch": 2.21606648199446,
"grad_norm": 0.5801565051078796,
"learning_rate": 1.8609393766395083e-06,
"loss": 0.288,
"step": 135
},
{
"epoch": 2.2326869806094183,
"grad_norm": 0.5398533940315247,
"learning_rate": 1.7860619515673034e-06,
"loss": 0.2958,
"step": 136
},
{
"epoch": 2.2493074792243766,
"grad_norm": 0.48142921924591064,
"learning_rate": 1.7123931571546826e-06,
"loss": 0.2506,
"step": 137
},
{
"epoch": 2.2659279778393353,
"grad_norm": 0.48484477400779724,
"learning_rate": 1.639960697222388e-06,
"loss": 0.2166,
"step": 138
},
{
"epoch": 2.2825484764542936,
"grad_norm": 0.4676513075828552,
"learning_rate": 1.5687918106563326e-06,
"loss": 0.2558,
"step": 139
},
{
"epoch": 2.299168975069252,
"grad_norm": 0.5008206963539124,
"learning_rate": 1.4989132611641576e-06,
"loss": 0.2315,
"step": 140
},
{
"epoch": 2.3157894736842106,
"grad_norm": 0.5055615901947021,
"learning_rate": 1.4303513272105057e-06,
"loss": 0.278,
"step": 141
},
{
"epoch": 2.332409972299169,
"grad_norm": 0.5048314332962036,
"learning_rate": 1.3631317921347564e-06,
"loss": 0.2469,
"step": 142
},
{
"epoch": 2.349030470914127,
"grad_norm": 0.4561052620410919,
"learning_rate": 1.297279934454978e-06,
"loss": 0.2363,
"step": 143
},
{
"epoch": 2.365650969529086,
"grad_norm": 0.4409971237182617,
"learning_rate": 1.2328205183616964e-06,
"loss": 0.2582,
"step": 144
},
{
"epoch": 2.3822714681440442,
"grad_norm": 0.5186073780059814,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.2354,
"step": 145
},
{
"epoch": 2.398891966759003,
"grad_norm": 0.4931983947753906,
"learning_rate": 1.1081754403792e-06,
"loss": 0.2628,
"step": 146
},
{
"epoch": 2.4155124653739612,
"grad_norm": 0.4725812077522278,
"learning_rate": 1.0480366524062041e-06,
"loss": 0.2465,
"step": 147
},
{
"epoch": 2.4321329639889195,
"grad_norm": 0.459830641746521,
"learning_rate": 9.893840362247809e-07,
"loss": 0.2494,
"step": 148
},
{
"epoch": 2.4487534626038783,
"grad_norm": 0.45882484316825867,
"learning_rate": 9.322396486851626e-07,
"loss": 0.2572,
"step": 149
},
{
"epoch": 2.4653739612188366,
"grad_norm": 0.4628044664859772,
"learning_rate": 8.766249794544662e-07,
"loss": 0.2473,
"step": 150
},
{
"epoch": 2.481994459833795,
"grad_norm": 0.43482884764671326,
"learning_rate": 8.225609429353187e-07,
"loss": 0.2334,
"step": 151
},
{
"epoch": 2.4986149584487536,
"grad_norm": 0.5092786550521851,
"learning_rate": 7.700678704007947e-07,
"loss": 0.2464,
"step": 152
},
{
"epoch": 2.515235457063712,
"grad_norm": 0.5002970695495605,
"learning_rate": 7.191655023486682e-07,
"loss": 0.2386,
"step": 153
},
{
"epoch": 2.5318559556786706,
"grad_norm": 0.44085896015167236,
"learning_rate": 6.698729810778065e-07,
"loss": 0.2231,
"step": 154
},
{
"epoch": 2.548476454293629,
"grad_norm": 0.4750898480415344,
"learning_rate": 6.222088434895462e-07,
"loss": 0.2746,
"step": 155
},
{
"epoch": 2.565096952908587,
"grad_norm": 0.5058760643005371,
"learning_rate": 5.76191014116711e-07,
"loss": 0.2753,
"step": 156
},
{
"epoch": 2.581717451523546,
"grad_norm": 0.4807314872741699,
"learning_rate": 5.318367983829393e-07,
"loss": 0.2295,
"step": 157
},
{
"epoch": 2.598337950138504,
"grad_norm": 0.4975450336933136,
"learning_rate": 4.891628760948114e-07,
"loss": 0.2623,
"step": 158
},
{
"epoch": 2.6149584487534625,
"grad_norm": 0.44517505168914795,
"learning_rate": 4.481852951692672e-07,
"loss": 0.2505,
"step": 159
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.526871919631958,
"learning_rate": 4.089194655986306e-07,
"loss": 0.2944,
"step": 160
},
{
"epoch": 2.6481994459833795,
"grad_norm": 0.5860976576805115,
"learning_rate": 3.7138015365554834e-07,
"loss": 0.2929,
"step": 161
},
{
"epoch": 2.664819944598338,
"grad_norm": 0.5570012927055359,
"learning_rate": 3.355814763399973e-07,
"loss": 0.2669,
"step": 162
},
{
"epoch": 2.6814404432132966,
"grad_norm": 0.46305856108665466,
"learning_rate": 3.015368960704584e-07,
"loss": 0.2464,
"step": 163
},
{
"epoch": 2.698060941828255,
"grad_norm": 0.49931517243385315,
"learning_rate": 2.6925921562124867e-07,
"loss": 0.233,
"step": 164
},
{
"epoch": 2.714681440443213,
"grad_norm": 0.4253719449043274,
"learning_rate": 2.3876057330792344e-07,
"loss": 0.2115,
"step": 165
},
{
"epoch": 2.731301939058172,
"grad_norm": 0.46956562995910645,
"learning_rate": 2.1005243842255552e-07,
"loss": 0.2419,
"step": 166
},
{
"epoch": 2.74792243767313,
"grad_norm": 0.47405821084976196,
"learning_rate": 1.8314560692059836e-07,
"loss": 0.2442,
"step": 167
},
{
"epoch": 2.7645429362880884,
"grad_norm": 0.5373594164848328,
"learning_rate": 1.5805019736097105e-07,
"loss": 0.304,
"step": 168
},
{
"epoch": 2.781163434903047,
"grad_norm": 0.49911409616470337,
"learning_rate": 1.3477564710088097e-07,
"loss": 0.2604,
"step": 169
},
{
"epoch": 2.7977839335180055,
"grad_norm": 0.524211585521698,
"learning_rate": 1.1333070874682217e-07,
"loss": 0.2319,
"step": 170
},
{
"epoch": 2.8144044321329638,
"grad_norm": 0.49799832701683044,
"learning_rate": 9.372344686307655e-08,
"loss": 0.2648,
"step": 171
},
{
"epoch": 2.8310249307479225,
"grad_norm": 0.4979800581932068,
"learning_rate": 7.59612349389599e-08,
"loss": 0.2671,
"step": 172
},
{
"epoch": 2.847645429362881,
"grad_norm": 0.5030661225318909,
"learning_rate": 6.005075261595495e-08,
"loss": 0.2219,
"step": 173
},
{
"epoch": 2.864265927977839,
"grad_norm": 0.4839530885219574,
"learning_rate": 4.599798317577342e-08,
"loss": 0.2981,
"step": 174
},
{
"epoch": 2.880886426592798,
"grad_norm": 0.49113729596138,
"learning_rate": 3.3808211290284886e-08,
"loss": 0.2574,
"step": 175
},
{
"epoch": 2.897506925207756,
"grad_norm": 0.5154249668121338,
"learning_rate": 2.3486021034170857e-08,
"loss": 0.2584,
"step": 176
},
{
"epoch": 2.914127423822715,
"grad_norm": 0.46952885389328003,
"learning_rate": 1.5035294161039882e-08,
"loss": 0.2785,
"step": 177
},
{
"epoch": 2.930747922437673,
"grad_norm": 0.49860695004463196,
"learning_rate": 8.459208643659122e-09,
"loss": 0.2572,
"step": 178
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.5341483354568481,
"learning_rate": 3.760237478849793e-09,
"loss": 0.2964,
"step": 179
},
{
"epoch": 2.96398891966759,
"grad_norm": 0.5575993061065674,
"learning_rate": 9.401477574932927e-10,
"loss": 0.2896,
"step": 180
},
{
"epoch": 2.96398891966759,
"step": 180,
"total_flos": 6.743893969836442e+16,
"train_loss": 0.3866574793226189,
"train_runtime": 24143.75,
"train_samples_per_second": 0.179,
"train_steps_per_second": 0.007
}
],
"logging_steps": 1,
"max_steps": 180,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.743893969836442e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}