Training in progress, step 2600, checkpoint
last-checkpoint/model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2f7029e76d2330b8c1ac76f27d2827fd53c1fb9a09bfd163aeb48e6ef056512d
 size 2066752
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5ab2bfc3184fdccd803bb517356e783cb35e80aca85ffa6d029528770eb4cd07
 size 4121235
last-checkpoint/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:cbe8803c48cf63f4eea1ebb748b4c2beb1d95a5bd75f9d32b496d3b2c5d0dd4e
 size 14391
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9c68a3dab5d287edad29b6a8ea33c1819f3635324de902c425ef8f164d34fb46
 size 1401
last-checkpoint/trainer_state.json
CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.
+  "epoch": 0.11229161268031441,
   "eval_steps": 100,
-  "global_step":
+  "global_step": 2600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -17708,6 +17708,714 @@
       "eval_samples_per_second": 1.114,
       "eval_steps_per_second": 0.139,
       "step": 2500
+    },
+    {
+      "epoch": 0.10801589358210245,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0009908435451512379,
+      "loss": 8.5773,
+      "step": 2501
+    },
+    {
+      "epoch": 0.10805908266390256,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.000990829935964463,
+      "loss": 8.0601,
+      "step": 2502
+    },
+    {
+      "epoch": 0.10810227174570268,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0009908163167651686,
+      "loss": 8.3679,
+      "step": 2503
+    },
+    {
+      "epoch": 0.1081454608275028,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.000990802687553633,
+      "loss": 7.7936,
+      "step": 2504
+    },
+    {
+      "epoch": 0.10818864990930292,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.000990789048330134,
+      "loss": 8.268,
+      "step": 2505
+    },
+    {
+      "epoch": 0.10823183899110304,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0009907753990949495,
+      "loss": 8.4076,
+      "step": 2506
+    },
+    {
+      "epoch": 0.10827502807290316,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.0009907617398483583,
+      "loss": 8.0655,
+      "step": 2507
+    },
+    {
+      "epoch": 0.10831821715470329,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0009907480705906393,
+      "loss": 8.4049,
+      "step": 2508
+    },
+    {
+      "epoch": 0.10836140623650341,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.0009907343913220707,
+      "loss": 8.2208,
+      "step": 2509
+    },
+    {
+      "epoch": 0.10840459531830353,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0009907207020429319,
+      "loss": 7.9682,
+      "step": 2510
+    },
+    {
+      "epoch": 0.10844778440010365,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.000990707002753502,
+      "loss": 8.4395,
+      "step": 2511
+    },
+    {
+      "epoch": 0.10849097348190377,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.0009906932934540607,
+      "loss": 8.1248,
+      "step": 2512
+    },
+    {
+      "epoch": 0.10853416256370389,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0009906795741448876,
+      "loss": 8.3858,
+      "step": 2513
+    },
+    {
+      "epoch": 0.10857735164550401,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0009906658448262623,
+      "loss": 8.2592,
+      "step": 2514
+    },
+    {
+      "epoch": 0.10862054072730414,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.000990652105498465,
+      "loss": 8.3764,
+      "step": 2515
+    },
+    {
+      "epoch": 0.10866372980910426,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.0009906383561617761,
+      "loss": 7.9524,
+      "step": 2516
+    },
+    {
+      "epoch": 0.10870691889090438,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.000990624596816476,
+      "loss": 8.2658,
+      "step": 2517
+    },
+    {
+      "epoch": 0.1087501079727045,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0009906108274628455,
+      "loss": 8.1708,
+      "step": 2518
+    },
+    {
+      "epoch": 0.10879329705450462,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0009905970481011652,
+      "loss": 8.3299,
+      "step": 2519
+    },
+    {
+      "epoch": 0.10883648613630474,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009905832587317163,
+      "loss": 8.2721,
+      "step": 2520
+    },
+    {
+      "epoch": 0.10887967521810486,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009905694593547803,
+      "loss": 8.2087,
+      "step": 2521
+    },
+    {
+      "epoch": 0.10892286429990498,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0009905556499706382,
+      "loss": 8.4273,
+      "step": 2522
+    },
+    {
+      "epoch": 0.1089660533817051,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0009905418305795723,
+      "loss": 8.2658,
+      "step": 2523
+    },
+    {
+      "epoch": 0.10900924246350523,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009905280011818644,
+      "loss": 7.9867,
+      "step": 2524
+    },
+    {
+      "epoch": 0.10905243154530535,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.000990514161777796,
+      "loss": 8.4169,
+      "step": 2525
+    },
+    {
+      "epoch": 0.10909562062710547,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0009905003123676503,
+      "loss": 8.4543,
+      "step": 2526
+    },
+    {
+      "epoch": 0.10913880970890559,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.000990486452951709,
+      "loss": 8.0104,
+      "step": 2527
+    },
+    {
+      "epoch": 0.10918199879070571,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0009904725835302552,
+      "loss": 8.2101,
+      "step": 2528
+    },
+    {
+      "epoch": 0.10922518787250583,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.000990458704103572,
+      "loss": 8.1934,
+      "step": 2529
+    },
+    {
+      "epoch": 0.10926837695430595,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0009904448146719421,
+      "loss": 8.4709,
+      "step": 2530
+    },
+    {
+      "epoch": 0.10931156603610608,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0009904309152356495,
+      "loss": 8.3082,
+      "step": 2531
+    },
+    {
+      "epoch": 0.1093547551179062,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0009904170057949769,
+      "loss": 8.3866,
+      "step": 2532
+    },
+    {
+      "epoch": 0.10939794419970632,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009904030863502086,
+      "loss": 8.956,
+      "step": 2533
+    },
+    {
+      "epoch": 0.10944113328150644,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.0009903891569016283,
+      "loss": 8.1289,
+      "step": 2534
+    },
+    {
+      "epoch": 0.10948432236330656,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0009903752174495203,
+      "loss": 8.267,
+      "step": 2535
+    },
+    {
+      "epoch": 0.10952751144510668,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.000990361267994169,
+      "loss": 8.279,
+      "step": 2536
+    },
+    {
+      "epoch": 0.1095707005269068,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0009903473085358587,
+      "loss": 8.0143,
+      "step": 2537
+    },
+    {
+      "epoch": 0.10961388960870692,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0009903333390748747,
+      "loss": 8.196,
+      "step": 2538
+    },
+    {
+      "epoch": 0.10965707869050705,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.000990319359611501,
+      "loss": 8.4366,
+      "step": 2539
+    },
+    {
+      "epoch": 0.10970026777230717,
+      "grad_norm": 0.5,
+      "learning_rate": 0.0009903053701460236,
+      "loss": 8.6248,
+      "step": 2540
+    },
+    {
+      "epoch": 0.10974345685410729,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0009902913706787279,
+      "loss": 8.3567,
+      "step": 2541
+    },
+    {
+      "epoch": 0.10978664593590741,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0009902773612098987,
+      "loss": 8.1894,
+      "step": 2542
+    },
+    {
+      "epoch": 0.10982983501770752,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0009902633417398225,
+      "loss": 8.4882,
+      "step": 2543
+    },
+    {
+      "epoch": 0.10987302409950764,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0009902493122687852,
+      "loss": 7.9297,
+      "step": 2544
+    },
+    {
+      "epoch": 0.10991621318130776,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0009902352727970728,
+      "loss": 8.1606,
+      "step": 2545
+    },
+    {
+      "epoch": 0.10995940226310788,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0009902212233249717,
+      "loss": 8.3975,
+      "step": 2546
+    },
+    {
+      "epoch": 0.110002591344908,
+      "grad_norm": 0.48046875,
+      "learning_rate": 0.0009902071638527685,
+      "loss": 8.5701,
+      "step": 2547
+    },
+    {
+      "epoch": 0.11004578042670812,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0009901930943807503,
+      "loss": 8.1647,
+      "step": 2548
+    },
+    {
+      "epoch": 0.11008896950850824,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.0009901790149092035,
+      "loss": 8.1903,
+      "step": 2549
+    },
+    {
+      "epoch": 0.11013215859030837,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0009901649254384158,
+      "loss": 7.8448,
+      "step": 2550
+    },
+    {
+      "epoch": 0.11017534767210849,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.0009901508259686745,
+      "loss": 8.0894,
+      "step": 2551
+    },
+    {
+      "epoch": 0.11021853675390861,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.0009901367165002673,
+      "loss": 8.2087,
+      "step": 2552
+    },
+    {
+      "epoch": 0.11026172583570873,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.0009901225970334816,
+      "loss": 8.2029,
+      "step": 2553
+    },
+    {
+      "epoch": 0.11030491491750885,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.0009901084675686062,
+      "loss": 8.4018,
+      "step": 2554
+    },
+    {
+      "epoch": 0.11034810399930897,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009900943281059287,
+      "loss": 8.3994,
+      "step": 2555
+    },
+    {
+      "epoch": 0.1103912930811091,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0009900801786457375,
+      "loss": 8.138,
+      "step": 2556
+    },
+    {
+      "epoch": 0.11043448216290921,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009900660191883217,
+      "loss": 8.1755,
+      "step": 2557
+    },
+    {
+      "epoch": 0.11047767124470934,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0009900518497339696,
+      "loss": 8.2262,
+      "step": 2558
+    },
+    {
+      "epoch": 0.11052086032650946,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0009900376702829707,
+      "loss": 8.4282,
+      "step": 2559
+    },
+    {
+      "epoch": 0.11056404940830958,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0009900234808356142,
+      "loss": 8.3542,
+      "step": 2560
+    },
+    {
+      "epoch": 0.1106072384901097,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.0009900092813921893,
+      "loss": 8.206,
+      "step": 2561
+    },
+    {
+      "epoch": 0.11065042757190982,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0009899950719529857,
+      "loss": 8.3899,
+      "step": 2562
+    },
+    {
+      "epoch": 0.11069361665370994,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0009899808525182935,
+      "loss": 8.4189,
+      "step": 2563
+    },
+    {
+      "epoch": 0.11073680573551006,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.0009899666230884024,
+      "loss": 8.3508,
+      "step": 2564
+    },
+    {
+      "epoch": 0.11077999481731018,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.0009899523836636032,
+      "loss": 8.0618,
+      "step": 2565
+    },
+    {
+      "epoch": 0.1108231838991103,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0009899381342441857,
+      "loss": 8.1894,
+      "step": 2566
+    },
+    {
+      "epoch": 0.11086637298091043,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0009899238748304411,
+      "loss": 8.373,
+      "step": 2567
+    },
+    {
+      "epoch": 0.11090956206271055,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009899096054226601,
+      "loss": 8.3343,
+      "step": 2568
+    },
+    {
+      "epoch": 0.11095275114451067,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0009898953260211339,
+      "loss": 8.0553,
+      "step": 2569
+    },
+    {
+      "epoch": 0.11099594022631079,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.0009898810366261535,
+      "loss": 8.664,
+      "step": 2570
+    },
+    {
+      "epoch": 0.11103912930811091,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0009898667372380107,
+      "loss": 8.3618,
+      "step": 2571
+    },
+    {
+      "epoch": 0.11108231838991103,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.000989852427856997,
+      "loss": 8.279,
+      "step": 2572
+    },
+    {
+      "epoch": 0.11112550747171115,
+      "grad_norm": 0.6328125,
+      "learning_rate": 0.0009898381084834044,
+      "loss": 8.2579,
+      "step": 2573
+    },
+    {
+      "epoch": 0.11116869655351128,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.000989823779117525,
+      "loss": 8.2548,
+      "step": 2574
+    },
+    {
+      "epoch": 0.1112118856353114,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.000989809439759651,
+      "loss": 8.7369,
+      "step": 2575
+    },
+    {
+      "epoch": 0.11125507471711152,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.000989795090410075,
+      "loss": 7.8746,
+      "step": 2576
+    },
+    {
+      "epoch": 0.11129826379891164,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0009897807310690898,
+      "loss": 8.6083,
+      "step": 2577
+    },
+    {
+      "epoch": 0.11134145288071176,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.000989766361736988,
+      "loss": 8.4279,
+      "step": 2578
+    },
+    {
+      "epoch": 0.11138464196251188,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0009897519824140632,
+      "loss": 8.1736,
+      "step": 2579
+    },
+    {
+      "epoch": 0.111427831044312,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0009897375931006082,
+      "loss": 8.6102,
+      "step": 2580
+    },
+    {
+      "epoch": 0.11147102012611212,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0009897231937969172,
+      "loss": 7.9732,
+      "step": 2581
+    },
+    {
+      "epoch": 0.11151420920791225,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0009897087845032832,
+      "loss": 8.2183,
+      "step": 2582
+    },
+    {
+      "epoch": 0.11155739828971237,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0009896943652200005,
+      "loss": 8.4685,
+      "step": 2583
+    },
+    {
+      "epoch": 0.11160058737151249,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.0009896799359473635,
+      "loss": 8.4296,
+      "step": 2584
+    },
+    {
+      "epoch": 0.1116437764533126,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.000989665496685666,
+      "loss": 8.4101,
+      "step": 2585
+    },
+    {
+      "epoch": 0.11168696553511272,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.0009896510474352027,
+      "loss": 8.5164,
+      "step": 2586
+    },
+    {
+      "epoch": 0.11173015461691284,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0009896365881962685,
+      "loss": 8.3774,
+      "step": 2587
+    },
+    {
+      "epoch": 0.11177334369871296,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0009896221189691586,
+      "loss": 8.139,
+      "step": 2588
+    },
+    {
+      "epoch": 0.11181653278051308,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.0009896076397541676,
+      "loss": 8.3887,
+      "step": 2589
+    },
+    {
+      "epoch": 0.1118597218623132,
+      "grad_norm": 0.53515625,
+      "learning_rate": 0.0009895931505515914,
+      "loss": 8.2581,
+      "step": 2590
+    },
+    {
+      "epoch": 0.11190291094411332,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.000989578651361725,
+      "loss": 8.1766,
+      "step": 2591
+    },
+    {
+      "epoch": 0.11194610002591344,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0009895641421848646,
+      "loss": 8.399,
+      "step": 2592
+    },
+    {
+      "epoch": 0.11198928910771357,
+      "grad_norm": 0.6328125,
+      "learning_rate": 0.000989549623021306,
+      "loss": 8.3802,
+      "step": 2593
+    },
+    {
+      "epoch": 0.11203247818951369,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0009895350938713455,
+      "loss": 8.1978,
+      "step": 2594
+    },
+    {
+      "epoch": 0.11207566727131381,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0009895205547352794,
+      "loss": 8.2764,
+      "step": 2595
+    },
+    {
+      "epoch": 0.11211885635311393,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.0009895060056134045,
+      "loss": 8.0888,
+      "step": 2596
+    },
+    {
+      "epoch": 0.11216204543491405,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0009894914465060172,
+      "loss": 8.5272,
+      "step": 2597
+    },
+    {
+      "epoch": 0.11220523451671417,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009894768774134147,
+      "loss": 8.2076,
+      "step": 2598
+    },
+    {
+      "epoch": 0.1122484235985143,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0009894622983358942,
+      "loss": 8.3313,
+      "step": 2599
+    },
+    {
+      "epoch": 0.11229161268031441,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0009894477092737529,
+      "loss": 8.4905,
+      "step": 2600
+    },
+    {
+      "epoch": 0.11229161268031441,
+      "eval_loss": 8.336125373840332,
+      "eval_runtime": 15.4776,
+      "eval_samples_per_second": 1.551,
+      "eval_steps_per_second": 0.194,
+      "step": 2600
     }
   ],
   "logging_steps": 1,
@@ -17727,7 +18435,7 @@
       "attributes": {}
     }
   },
-  "total_flos":
+  "total_flos": 8311033036800.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null