|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 340, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.029411764705882353, |
|
"grad_norm": 2.488327741622925, |
|
"learning_rate": 5.882352941176471e-05, |
|
"loss": 1.9016, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.14705882352941177, |
|
"grad_norm": 1.427666425704956, |
|
"learning_rate": 0.00029411764705882356, |
|
"loss": 1.8391, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.29411764705882354, |
|
"grad_norm": 0.6930230259895325, |
|
"learning_rate": 0.0005882352941176471, |
|
"loss": 1.5625, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.4411764705882353, |
|
"grad_norm": 0.2596357762813568, |
|
"learning_rate": 0.0008823529411764706, |
|
"loss": 1.3849, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.16956600546836853, |
|
"learning_rate": 0.0011764705882352942, |
|
"loss": 1.2969, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.7352941176470589, |
|
"grad_norm": 0.12523840367794037, |
|
"learning_rate": 0.0014705882352941178, |
|
"loss": 1.2405, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8823529411764706, |
|
"grad_norm": 0.09968027472496033, |
|
"learning_rate": 0.0017647058823529412, |
|
"loss": 1.2024, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.7381153106689453, |
|
"eval_runtime": 0.828, |
|
"eval_samples_per_second": 4.831, |
|
"eval_steps_per_second": 1.208, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.0294117647058822, |
|
"grad_norm": 0.366025447845459, |
|
"learning_rate": 0.001999947298487173, |
|
"loss": 1.1724, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 0.12924526631832123, |
|
"learning_rate": 0.0019981033287370443, |
|
"loss": 1.1507, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.3235294117647058, |
|
"grad_norm": 0.15898454189300537, |
|
"learning_rate": 0.0019936298356132177, |
|
"loss": 1.1306, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4705882352941178, |
|
"grad_norm": 0.09307724982500076, |
|
"learning_rate": 0.0019865386046236597, |
|
"loss": 1.1156, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.6176470588235294, |
|
"grad_norm": 0.0881420224905014, |
|
"learning_rate": 0.001976848317759601, |
|
"loss": 1.1038, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.7647058823529411, |
|
"grad_norm": 0.08049994707107544, |
|
"learning_rate": 0.0019645845042774554, |
|
"loss": 1.0939, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.9117647058823528, |
|
"grad_norm": 0.07463372498750687, |
|
"learning_rate": 0.001949779473441478, |
|
"loss": 1.0846, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.6922645568847656, |
|
"eval_runtime": 0.8305, |
|
"eval_samples_per_second": 4.817, |
|
"eval_steps_per_second": 1.204, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.0588235294117645, |
|
"grad_norm": 0.09218032658100128, |
|
"learning_rate": 0.0019324722294043557, |
|
"loss": 1.0691, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.2058823529411766, |
|
"grad_norm": 0.09160086512565613, |
|
"learning_rate": 0.0019127083684499803, |
|
"loss": 1.0627, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 0.08734273165464401, |
|
"learning_rate": 0.0018905399588691164, |
|
"loss": 1.0554, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.16841983795166016, |
|
"learning_rate": 0.001866025403784439, |
|
"loss": 1.0504, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.6470588235294117, |
|
"grad_norm": 0.0975656732916832, |
|
"learning_rate": 0.0018392292872863268, |
|
"loss": 1.0538, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.7941176470588234, |
|
"grad_norm": 0.08383582532405853, |
|
"learning_rate": 0.0018102222042847736, |
|
"loss": 1.0443, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 0.09132234007120132, |
|
"learning_rate": 0.0017790805745256705, |
|
"loss": 1.0447, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.6731293201446533, |
|
"eval_runtime": 0.8295, |
|
"eval_samples_per_second": 4.822, |
|
"eval_steps_per_second": 1.206, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.088235294117647, |
|
"grad_norm": 0.09759514033794403, |
|
"learning_rate": 0.0017458864412614435, |
|
"loss": 1.0375, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 3.235294117647059, |
|
"grad_norm": 0.09400475025177002, |
|
"learning_rate": 0.0017107272551064472, |
|
"loss": 1.0277, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 3.3823529411764706, |
|
"grad_norm": 0.07706195116043091, |
|
"learning_rate": 0.0016736956436465573, |
|
"loss": 1.0239, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 3.5294117647058822, |
|
"grad_norm": 0.08647977560758591, |
|
"learning_rate": 0.0016348891674099228, |
|
"loss": 1.0229, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 3.6764705882352944, |
|
"grad_norm": 0.08289068937301636, |
|
"learning_rate": 0.0015944100628417868, |
|
"loss": 1.0238, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 3.8235294117647056, |
|
"grad_norm": 0.08053147792816162, |
|
"learning_rate": 0.0015523649729605059, |
|
"loss": 1.0195, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 3.9705882352941178, |
|
"grad_norm": 0.08788559585809708, |
|
"learning_rate": 0.001508864666404365, |
|
"loss": 1.0207, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.6660211086273193, |
|
"eval_runtime": 0.8294, |
|
"eval_samples_per_second": 4.823, |
|
"eval_steps_per_second": 1.206, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 4.117647058823529, |
|
"grad_norm": 0.0886395052075386, |
|
"learning_rate": 0.0014640237456093634, |
|
"loss": 1.007, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.264705882352941, |
|
"grad_norm": 0.08254272490739822, |
|
"learning_rate": 0.0014179603448867834, |
|
"loss": 1.0049, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 4.411764705882353, |
|
"grad_norm": 0.08430016040802002, |
|
"learning_rate": 0.0013707958191959608, |
|
"loss": 1.0103, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 4.5588235294117645, |
|
"grad_norm": 0.0816187709569931, |
|
"learning_rate": 0.001322654424432195, |
|
"loss": 1.0011, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 4.705882352941177, |
|
"grad_norm": 0.08980533480644226, |
|
"learning_rate": 0.0012736629900720832, |
|
"loss": 1.0107, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 4.852941176470588, |
|
"grad_norm": 0.08757297694683075, |
|
"learning_rate": 0.0012239505850387032, |
|
"loss": 1.0035, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.08535553514957428, |
|
"learning_rate": 0.0011736481776669307, |
|
"loss": 1.0039, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 1.668144941329956, |
|
"eval_runtime": 0.8286, |
|
"eval_samples_per_second": 4.827, |
|
"eval_steps_per_second": 1.207, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 5.147058823529412, |
|
"grad_norm": 0.08561732620000839, |
|
"learning_rate": 0.0011228882906647141, |
|
"loss": 0.9888, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 5.294117647058823, |
|
"grad_norm": 0.08237646520137787, |
|
"learning_rate": 0.0010718046519793277, |
|
"loss": 0.994, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.4411764705882355, |
|
"grad_norm": 0.07582972943782806, |
|
"learning_rate": 0.0010205318424883906, |
|
"loss": 0.9953, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 5.588235294117647, |
|
"grad_norm": 0.07832607626914978, |
|
"learning_rate": 0.0009692049414438299, |
|
"loss": 0.991, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 5.735294117647059, |
|
"grad_norm": 0.08117620646953583, |
|
"learning_rate": 0.0009179591706028624, |
|
"loss": 0.9931, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 5.882352941176471, |
|
"grad_norm": 0.07949723303318024, |
|
"learning_rate": 0.0008669295379835467, |
|
"loss": 0.9957, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.6619951725006104, |
|
"eval_runtime": 0.8305, |
|
"eval_samples_per_second": 4.817, |
|
"eval_steps_per_second": 1.204, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 6.029411764705882, |
|
"grad_norm": 0.07223918288946152, |
|
"learning_rate": 0.0008162504821834296, |
|
"loss": 0.9855, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 6.176470588235294, |
|
"grad_norm": 0.0734533816576004, |
|
"learning_rate": 0.0007660555181983517, |
|
"loss": 0.9822, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 6.323529411764706, |
|
"grad_norm": 0.07729926705360413, |
|
"learning_rate": 0.0007164768856744892, |
|
"loss": 0.9813, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 6.470588235294118, |
|
"grad_norm": 0.0778665617108345, |
|
"learning_rate": 0.0006676452005203405, |
|
"loss": 0.9853, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 6.617647058823529, |
|
"grad_norm": 0.06967757642269135, |
|
"learning_rate": 0.0006196891107964744, |
|
"loss": 0.9837, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 6.764705882352941, |
|
"grad_norm": 0.07201401889324188, |
|
"learning_rate": 0.0005727349577896194, |
|
"loss": 0.9823, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 6.911764705882353, |
|
"grad_norm": 0.07436826825141907, |
|
"learning_rate": 0.00052690644316399, |
|
"loss": 0.9793, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 1.6655919551849365, |
|
"eval_runtime": 0.8312, |
|
"eval_samples_per_second": 4.812, |
|
"eval_steps_per_second": 1.203, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 7.0588235294117645, |
|
"grad_norm": 0.07127279043197632, |
|
"learning_rate": 0.0004823243030667576, |
|
"loss": 0.98, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 7.205882352941177, |
|
"grad_norm": 0.06584794819355011, |
|
"learning_rate": 0.0004391059900462304, |
|
"loss": 0.9738, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 7.352941176470588, |
|
"grad_norm": 0.06734811514616013, |
|
"learning_rate": 0.0003973653636207437, |
|
"loss": 0.9705, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.06853578239679337, |
|
"learning_rate": 0.0003572123903134606, |
|
"loss": 0.9778, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 7.647058823529412, |
|
"grad_norm": 0.0678030252456665, |
|
"learning_rate": 0.0003187528539433457, |
|
"loss": 0.9722, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 7.794117647058823, |
|
"grad_norm": 0.06483301520347595, |
|
"learning_rate": 0.0002820880769355582, |
|
"loss": 0.9782, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 7.9411764705882355, |
|
"grad_norm": 0.06444835662841797, |
|
"learning_rate": 0.00024731465338547555, |
|
"loss": 0.9761, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.670668125152588, |
|
"eval_runtime": 0.8308, |
|
"eval_samples_per_second": 4.815, |
|
"eval_steps_per_second": 1.204, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 8.088235294117647, |
|
"grad_norm": 0.06315235048532486, |
|
"learning_rate": 0.00021452419457960138, |
|
"loss": 0.9718, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 8.235294117647058, |
|
"grad_norm": 0.06345119327306747, |
|
"learning_rate": 0.0001838030876437784, |
|
"loss": 0.977, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 8.382352941176471, |
|
"grad_norm": 0.0637635886669159, |
|
"learning_rate": 0.00015523226795456348, |
|
"loss": 0.9741, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 8.529411764705882, |
|
"grad_norm": 0.06295765936374664, |
|
"learning_rate": 0.00012888700591334225, |
|
"loss": 0.9698, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 8.676470588235293, |
|
"grad_norm": 0.06005469709634781, |
|
"learning_rate": 0.00010483670864493777, |
|
"loss": 0.967, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 8.823529411764707, |
|
"grad_norm": 0.060096003115177155, |
|
"learning_rate": 8.31447371431372e-05, |
|
"loss": 0.9651, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 8.970588235294118, |
|
"grad_norm": 0.05939820781350136, |
|
"learning_rate": 6.386823934487617e-05, |
|
"loss": 0.9678, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 1.6740888357162476, |
|
"eval_runtime": 0.8293, |
|
"eval_samples_per_second": 4.824, |
|
"eval_steps_per_second": 1.206, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 9.117647058823529, |
|
"grad_norm": 0.058787424117326736, |
|
"learning_rate": 4.705799957284351e-05, |
|
"loss": 0.9715, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 9.264705882352942, |
|
"grad_norm": 0.057846549898386, |
|
"learning_rate": 3.275830474315855e-05, |
|
"loss": 0.9632, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 9.411764705882353, |
|
"grad_norm": 0.05796463415026665, |
|
"learning_rate": 2.1006827690595476e-05, |
|
"loss": 0.9622, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 9.558823529411764, |
|
"grad_norm": 0.057375218719244, |
|
"learning_rate": 1.1834527918740623e-05, |
|
"loss": 0.9712, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 9.705882352941176, |
|
"grad_norm": 0.05756480246782303, |
|
"learning_rate": 5.265570036553813e-06, |
|
"loss": 0.9672, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 9.852941176470589, |
|
"grad_norm": 0.05712839215993881, |
|
"learning_rate": 1.3172600962190196e-06, |
|
"loss": 0.9695, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.057304926216602325, |
|
"learning_rate": 0.0, |
|
"loss": 0.9709, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 1.6745353937149048, |
|
"eval_runtime": 0.8292, |
|
"eval_samples_per_second": 4.824, |
|
"eval_steps_per_second": 1.206, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 340, |
|
"total_flos": 1.0444708917333197e+18, |
|
"train_loss": 1.0473914388348073, |
|
"train_runtime": 1591.5084, |
|
"train_samples_per_second": 108.407, |
|
"train_steps_per_second": 0.214 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 340, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0444708917333197e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|