{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 340,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.029411764705882353,
"grad_norm": 2.488327741622925,
"learning_rate": 5.882352941176471e-05,
"loss": 1.9016,
"step": 1
},
{
"epoch": 0.14705882352941177,
"grad_norm": 1.427666425704956,
"learning_rate": 0.00029411764705882356,
"loss": 1.8391,
"step": 5
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.6930230259895325,
"learning_rate": 0.0005882352941176471,
"loss": 1.5625,
"step": 10
},
{
"epoch": 0.4411764705882353,
"grad_norm": 0.2596357762813568,
"learning_rate": 0.0008823529411764706,
"loss": 1.3849,
"step": 15
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.16956600546836853,
"learning_rate": 0.0011764705882352942,
"loss": 1.2969,
"step": 20
},
{
"epoch": 0.7352941176470589,
"grad_norm": 0.12523840367794037,
"learning_rate": 0.0014705882352941178,
"loss": 1.2405,
"step": 25
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.09968027472496033,
"learning_rate": 0.0017647058823529412,
"loss": 1.2024,
"step": 30
},
{
"epoch": 1.0,
"eval_loss": 1.7381153106689453,
"eval_runtime": 0.828,
"eval_samples_per_second": 4.831,
"eval_steps_per_second": 1.208,
"step": 34
},
{
"epoch": 1.0294117647058822,
"grad_norm": 0.366025447845459,
"learning_rate": 0.001999947298487173,
"loss": 1.1724,
"step": 35
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.12924526631832123,
"learning_rate": 0.0019981033287370443,
"loss": 1.1507,
"step": 40
},
{
"epoch": 1.3235294117647058,
"grad_norm": 0.15898454189300537,
"learning_rate": 0.0019936298356132177,
"loss": 1.1306,
"step": 45
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.09307724982500076,
"learning_rate": 0.0019865386046236597,
"loss": 1.1156,
"step": 50
},
{
"epoch": 1.6176470588235294,
"grad_norm": 0.0881420224905014,
"learning_rate": 0.001976848317759601,
"loss": 1.1038,
"step": 55
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.08049994707107544,
"learning_rate": 0.0019645845042774554,
"loss": 1.0939,
"step": 60
},
{
"epoch": 1.9117647058823528,
"grad_norm": 0.07463372498750687,
"learning_rate": 0.001949779473441478,
"loss": 1.0846,
"step": 65
},
{
"epoch": 2.0,
"eval_loss": 1.6922645568847656,
"eval_runtime": 0.8305,
"eval_samples_per_second": 4.817,
"eval_steps_per_second": 1.204,
"step": 68
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.09218032658100128,
"learning_rate": 0.0019324722294043557,
"loss": 1.0691,
"step": 70
},
{
"epoch": 2.2058823529411766,
"grad_norm": 0.09160086512565613,
"learning_rate": 0.0019127083684499803,
"loss": 1.0627,
"step": 75
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.08734273165464401,
"learning_rate": 0.0018905399588691164,
"loss": 1.0554,
"step": 80
},
{
"epoch": 2.5,
"grad_norm": 0.16841983795166016,
"learning_rate": 0.001866025403784439,
"loss": 1.0504,
"step": 85
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.0975656732916832,
"learning_rate": 0.0018392292872863268,
"loss": 1.0538,
"step": 90
},
{
"epoch": 2.7941176470588234,
"grad_norm": 0.08383582532405853,
"learning_rate": 0.0018102222042847736,
"loss": 1.0443,
"step": 95
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.09132234007120132,
"learning_rate": 0.0017790805745256705,
"loss": 1.0447,
"step": 100
},
{
"epoch": 3.0,
"eval_loss": 1.6731293201446533,
"eval_runtime": 0.8295,
"eval_samples_per_second": 4.822,
"eval_steps_per_second": 1.206,
"step": 102
},
{
"epoch": 3.088235294117647,
"grad_norm": 0.09759514033794403,
"learning_rate": 0.0017458864412614435,
"loss": 1.0375,
"step": 105
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.09400475025177002,
"learning_rate": 0.0017107272551064472,
"loss": 1.0277,
"step": 110
},
{
"epoch": 3.3823529411764706,
"grad_norm": 0.07706195116043091,
"learning_rate": 0.0016736956436465573,
"loss": 1.0239,
"step": 115
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.08647977560758591,
"learning_rate": 0.0016348891674099228,
"loss": 1.0229,
"step": 120
},
{
"epoch": 3.6764705882352944,
"grad_norm": 0.08289068937301636,
"learning_rate": 0.0015944100628417868,
"loss": 1.0238,
"step": 125
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.08053147792816162,
"learning_rate": 0.0015523649729605059,
"loss": 1.0195,
"step": 130
},
{
"epoch": 3.9705882352941178,
"grad_norm": 0.08788559585809708,
"learning_rate": 0.001508864666404365,
"loss": 1.0207,
"step": 135
},
{
"epoch": 4.0,
"eval_loss": 1.6660211086273193,
"eval_runtime": 0.8294,
"eval_samples_per_second": 4.823,
"eval_steps_per_second": 1.206,
"step": 136
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.0886395052075386,
"learning_rate": 0.0014640237456093634,
"loss": 1.007,
"step": 140
},
{
"epoch": 4.264705882352941,
"grad_norm": 0.08254272490739822,
"learning_rate": 0.0014179603448867834,
"loss": 1.0049,
"step": 145
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.08430016040802002,
"learning_rate": 0.0013707958191959608,
"loss": 1.0103,
"step": 150
},
{
"epoch": 4.5588235294117645,
"grad_norm": 0.0816187709569931,
"learning_rate": 0.001322654424432195,
"loss": 1.0011,
"step": 155
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.08980533480644226,
"learning_rate": 0.0012736629900720832,
"loss": 1.0107,
"step": 160
},
{
"epoch": 4.852941176470588,
"grad_norm": 0.08757297694683075,
"learning_rate": 0.0012239505850387032,
"loss": 1.0035,
"step": 165
},
{
"epoch": 5.0,
"grad_norm": 0.08535553514957428,
"learning_rate": 0.0011736481776669307,
"loss": 1.0039,
"step": 170
},
{
"epoch": 5.0,
"eval_loss": 1.668144941329956,
"eval_runtime": 0.8286,
"eval_samples_per_second": 4.827,
"eval_steps_per_second": 1.207,
"step": 170
},
{
"epoch": 5.147058823529412,
"grad_norm": 0.08561732620000839,
"learning_rate": 0.0011228882906647141,
"loss": 0.9888,
"step": 175
},
{
"epoch": 5.294117647058823,
"grad_norm": 0.08237646520137787,
"learning_rate": 0.0010718046519793277,
"loss": 0.994,
"step": 180
},
{
"epoch": 5.4411764705882355,
"grad_norm": 0.07582972943782806,
"learning_rate": 0.0010205318424883906,
"loss": 0.9953,
"step": 185
},
{
"epoch": 5.588235294117647,
"grad_norm": 0.07832607626914978,
"learning_rate": 0.0009692049414438299,
"loss": 0.991,
"step": 190
},
{
"epoch": 5.735294117647059,
"grad_norm": 0.08117620646953583,
"learning_rate": 0.0009179591706028624,
"loss": 0.9931,
"step": 195
},
{
"epoch": 5.882352941176471,
"grad_norm": 0.07949723303318024,
"learning_rate": 0.0008669295379835467,
"loss": 0.9957,
"step": 200
},
{
"epoch": 6.0,
"eval_loss": 1.6619951725006104,
"eval_runtime": 0.8305,
"eval_samples_per_second": 4.817,
"eval_steps_per_second": 1.204,
"step": 204
},
{
"epoch": 6.029411764705882,
"grad_norm": 0.07223918288946152,
"learning_rate": 0.0008162504821834296,
"loss": 0.9855,
"step": 205
},
{
"epoch": 6.176470588235294,
"grad_norm": 0.0734533816576004,
"learning_rate": 0.0007660555181983517,
"loss": 0.9822,
"step": 210
},
{
"epoch": 6.323529411764706,
"grad_norm": 0.07729926705360413,
"learning_rate": 0.0007164768856744892,
"loss": 0.9813,
"step": 215
},
{
"epoch": 6.470588235294118,
"grad_norm": 0.0778665617108345,
"learning_rate": 0.0006676452005203405,
"loss": 0.9853,
"step": 220
},
{
"epoch": 6.617647058823529,
"grad_norm": 0.06967757642269135,
"learning_rate": 0.0006196891107964744,
"loss": 0.9837,
"step": 225
},
{
"epoch": 6.764705882352941,
"grad_norm": 0.07201401889324188,
"learning_rate": 0.0005727349577896194,
"loss": 0.9823,
"step": 230
},
{
"epoch": 6.911764705882353,
"grad_norm": 0.07436826825141907,
"learning_rate": 0.00052690644316399,
"loss": 0.9793,
"step": 235
},
{
"epoch": 7.0,
"eval_loss": 1.6655919551849365,
"eval_runtime": 0.8312,
"eval_samples_per_second": 4.812,
"eval_steps_per_second": 1.203,
"step": 238
},
{
"epoch": 7.0588235294117645,
"grad_norm": 0.07127279043197632,
"learning_rate": 0.0004823243030667576,
"loss": 0.98,
"step": 240
},
{
"epoch": 7.205882352941177,
"grad_norm": 0.06584794819355011,
"learning_rate": 0.0004391059900462304,
"loss": 0.9738,
"step": 245
},
{
"epoch": 7.352941176470588,
"grad_norm": 0.06734811514616013,
"learning_rate": 0.0003973653636207437,
"loss": 0.9705,
"step": 250
},
{
"epoch": 7.5,
"grad_norm": 0.06853578239679337,
"learning_rate": 0.0003572123903134606,
"loss": 0.9778,
"step": 255
},
{
"epoch": 7.647058823529412,
"grad_norm": 0.0678030252456665,
"learning_rate": 0.0003187528539433457,
"loss": 0.9722,
"step": 260
},
{
"epoch": 7.794117647058823,
"grad_norm": 0.06483301520347595,
"learning_rate": 0.0002820880769355582,
"loss": 0.9782,
"step": 265
},
{
"epoch": 7.9411764705882355,
"grad_norm": 0.06444835662841797,
"learning_rate": 0.00024731465338547555,
"loss": 0.9761,
"step": 270
},
{
"epoch": 8.0,
"eval_loss": 1.670668125152588,
"eval_runtime": 0.8308,
"eval_samples_per_second": 4.815,
"eval_steps_per_second": 1.204,
"step": 272
},
{
"epoch": 8.088235294117647,
"grad_norm": 0.06315235048532486,
"learning_rate": 0.00021452419457960138,
"loss": 0.9718,
"step": 275
},
{
"epoch": 8.235294117647058,
"grad_norm": 0.06345119327306747,
"learning_rate": 0.0001838030876437784,
"loss": 0.977,
"step": 280
},
{
"epoch": 8.382352941176471,
"grad_norm": 0.0637635886669159,
"learning_rate": 0.00015523226795456348,
"loss": 0.9741,
"step": 285
},
{
"epoch": 8.529411764705882,
"grad_norm": 0.06295765936374664,
"learning_rate": 0.00012888700591334225,
"loss": 0.9698,
"step": 290
},
{
"epoch": 8.676470588235293,
"grad_norm": 0.06005469709634781,
"learning_rate": 0.00010483670864493777,
"loss": 0.967,
"step": 295
},
{
"epoch": 8.823529411764707,
"grad_norm": 0.060096003115177155,
"learning_rate": 8.31447371431372e-05,
"loss": 0.9651,
"step": 300
},
{
"epoch": 8.970588235294118,
"grad_norm": 0.05939820781350136,
"learning_rate": 6.386823934487617e-05,
"loss": 0.9678,
"step": 305
},
{
"epoch": 9.0,
"eval_loss": 1.6740888357162476,
"eval_runtime": 0.8293,
"eval_samples_per_second": 4.824,
"eval_steps_per_second": 1.206,
"step": 306
},
{
"epoch": 9.117647058823529,
"grad_norm": 0.058787424117326736,
"learning_rate": 4.705799957284351e-05,
"loss": 0.9715,
"step": 310
},
{
"epoch": 9.264705882352942,
"grad_norm": 0.057846549898386,
"learning_rate": 3.275830474315855e-05,
"loss": 0.9632,
"step": 315
},
{
"epoch": 9.411764705882353,
"grad_norm": 0.05796463415026665,
"learning_rate": 2.1006827690595476e-05,
"loss": 0.9622,
"step": 320
},
{
"epoch": 9.558823529411764,
"grad_norm": 0.057375218719244,
"learning_rate": 1.1834527918740623e-05,
"loss": 0.9712,
"step": 325
},
{
"epoch": 9.705882352941176,
"grad_norm": 0.05756480246782303,
"learning_rate": 5.265570036553813e-06,
"loss": 0.9672,
"step": 330
},
{
"epoch": 9.852941176470589,
"grad_norm": 0.05712839215993881,
"learning_rate": 1.3172600962190196e-06,
"loss": 0.9695,
"step": 335
},
{
"epoch": 10.0,
"grad_norm": 0.057304926216602325,
"learning_rate": 0.0,
"loss": 0.9709,
"step": 340
},
{
"epoch": 10.0,
"eval_loss": 1.6745353937149048,
"eval_runtime": 0.8292,
"eval_samples_per_second": 4.824,
"eval_steps_per_second": 1.206,
"step": 340
},
{
"epoch": 10.0,
"step": 340,
"total_flos": 1.0444708917333197e+18,
"train_loss": 1.0473914388348073,
"train_runtime": 1591.5084,
"train_samples_per_second": 108.407,
"train_steps_per_second": 0.214
}
],
"logging_steps": 5,
"max_steps": 340,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0444708917333197e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}