kiddothe2b commited on
Commit
b8dec3e
·
1 Parent(s): eda42da

Training in progress, step 19200

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:383875ed49bab0b3a07e77766efb44191fb9f1834ccf4e7c6e4692b925b1a4d5
3
  size 745634697
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb7d70d52e6f6ba52d1887214633df22322169aae41b2ab9790870f7cc9779d8
3
  size 745634697
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9161e0e47a64c5b65b5d9cdc06273c036dd388860216eaa3c16c2c8bd9536ef
3
  size 372832803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55326f8f5fa73d8cae36ee1024b1ca50073bb54f266592786e9989158b712f06
3
  size 372832803
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:545fcebd225c2fbcaaae084db32b315ff159bcb9f66f876ced049afa99cb2632
3
  size 15523
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:284fe99a435cf0024e2dad7b8f41c11dc1317cc722e9161cea9c17f8c2b38610
3
  size 15523
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a63c18679f872f561021a84d9bfcd3fad0c807bcef87d1a807b9818f9895c1f
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12a6154fa53f0286557ec7a9b6bf6b9f5b2fb01f4345510fa7b96c5e44005857
3
  size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2,
5
- "global_step": 12800,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -792,11 +792,404 @@
792
  "eval_samples_per_second": 36.278,
793
  "eval_steps_per_second": 2.267,
794
  "step": 12800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
  }
796
  ],
797
  "max_steps": 64000,
798
  "num_train_epochs": 9223372036854775807,
799
- "total_flos": 1.353967057502208e+17,
800
  "trial_name": null,
801
  "trial_params": null
802
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3,
5
+ "global_step": 19200,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
792
  "eval_samples_per_second": 36.278,
793
  "eval_steps_per_second": 2.267,
794
  "step": 12800
795
+ },
796
+ {
797
+ "epoch": 0.2,
798
+ "learning_rate": 0.001,
799
+ "loss": 7.9998,
800
+ "step": 12900
801
+ },
802
+ {
803
+ "epoch": 0.2,
804
+ "learning_rate": 0.001,
805
+ "loss": 7.9955,
806
+ "step": 13000
807
+ },
808
+ {
809
+ "epoch": 0.2,
810
+ "learning_rate": 0.001,
811
+ "loss": 8.0036,
812
+ "step": 13100
813
+ },
814
+ {
815
+ "epoch": 0.21,
816
+ "learning_rate": 0.001,
817
+ "loss": 8.0021,
818
+ "step": 13200
819
+ },
820
+ {
821
+ "epoch": 0.21,
822
+ "learning_rate": 0.001,
823
+ "loss": 8.0031,
824
+ "step": 13300
825
+ },
826
+ {
827
+ "epoch": 0.21,
828
+ "learning_rate": 0.001,
829
+ "loss": 8.0115,
830
+ "step": 13400
831
+ },
832
+ {
833
+ "epoch": 0.21,
834
+ "learning_rate": 0.001,
835
+ "loss": 7.994,
836
+ "step": 13500
837
+ },
838
+ {
839
+ "epoch": 0.21,
840
+ "learning_rate": 0.001,
841
+ "loss": 8.0121,
842
+ "step": 13600
843
+ },
844
+ {
845
+ "epoch": 0.21,
846
+ "learning_rate": 0.001,
847
+ "loss": 7.9854,
848
+ "step": 13700
849
+ },
850
+ {
851
+ "epoch": 0.22,
852
+ "learning_rate": 0.001,
853
+ "loss": 7.9859,
854
+ "step": 13800
855
+ },
856
+ {
857
+ "epoch": 0.22,
858
+ "learning_rate": 0.001,
859
+ "loss": 8.0162,
860
+ "step": 13900
861
+ },
862
+ {
863
+ "epoch": 0.22,
864
+ "learning_rate": 0.001,
865
+ "loss": 7.9942,
866
+ "step": 14000
867
+ },
868
+ {
869
+ "epoch": 0.22,
870
+ "learning_rate": 0.001,
871
+ "loss": 8.0182,
872
+ "step": 14100
873
+ },
874
+ {
875
+ "epoch": 0.22,
876
+ "learning_rate": 0.001,
877
+ "loss": 8.0064,
878
+ "step": 14200
879
+ },
880
+ {
881
+ "epoch": 0.22,
882
+ "learning_rate": 0.001,
883
+ "loss": 8.0209,
884
+ "step": 14300
885
+ },
886
+ {
887
+ "epoch": 0.23,
888
+ "learning_rate": 0.001,
889
+ "loss": 7.9981,
890
+ "step": 14400
891
+ },
892
+ {
893
+ "epoch": 0.23,
894
+ "learning_rate": 0.001,
895
+ "loss": 7.988,
896
+ "step": 14500
897
+ },
898
+ {
899
+ "epoch": 0.23,
900
+ "learning_rate": 0.001,
901
+ "loss": 7.9834,
902
+ "step": 14600
903
+ },
904
+ {
905
+ "epoch": 0.23,
906
+ "learning_rate": 0.001,
907
+ "loss": 8.0015,
908
+ "step": 14700
909
+ },
910
+ {
911
+ "epoch": 0.23,
912
+ "learning_rate": 0.001,
913
+ "loss": 7.9767,
914
+ "step": 14800
915
+ },
916
+ {
917
+ "epoch": 0.23,
918
+ "learning_rate": 0.001,
919
+ "loss": 8.0057,
920
+ "step": 14900
921
+ },
922
+ {
923
+ "epoch": 0.23,
924
+ "learning_rate": 0.001,
925
+ "loss": 8.0191,
926
+ "step": 15000
927
+ },
928
+ {
929
+ "epoch": 0.24,
930
+ "learning_rate": 0.001,
931
+ "loss": 7.9998,
932
+ "step": 15100
933
+ },
934
+ {
935
+ "epoch": 0.24,
936
+ "learning_rate": 0.001,
937
+ "loss": 7.9991,
938
+ "step": 15200
939
+ },
940
+ {
941
+ "epoch": 0.24,
942
+ "learning_rate": 0.001,
943
+ "loss": 8.0132,
944
+ "step": 15300
945
+ },
946
+ {
947
+ "epoch": 0.24,
948
+ "learning_rate": 0.001,
949
+ "loss": 8.0015,
950
+ "step": 15400
951
+ },
952
+ {
953
+ "epoch": 0.24,
954
+ "learning_rate": 0.001,
955
+ "loss": 8.0014,
956
+ "step": 15500
957
+ },
958
+ {
959
+ "epoch": 0.24,
960
+ "learning_rate": 0.001,
961
+ "loss": 8.0031,
962
+ "step": 15600
963
+ },
964
+ {
965
+ "epoch": 0.25,
966
+ "learning_rate": 0.001,
967
+ "loss": 7.9966,
968
+ "step": 15700
969
+ },
970
+ {
971
+ "epoch": 0.25,
972
+ "learning_rate": 0.001,
973
+ "loss": 7.9944,
974
+ "step": 15800
975
+ },
976
+ {
977
+ "epoch": 0.25,
978
+ "learning_rate": 0.001,
979
+ "loss": 8.0058,
980
+ "step": 15900
981
+ },
982
+ {
983
+ "epoch": 0.25,
984
+ "learning_rate": 0.001,
985
+ "loss": 7.99,
986
+ "step": 16000
987
+ },
988
+ {
989
+ "epoch": 0.25,
990
+ "learning_rate": 0.001,
991
+ "loss": 7.9863,
992
+ "step": 16100
993
+ },
994
+ {
995
+ "epoch": 0.25,
996
+ "learning_rate": 0.001,
997
+ "loss": 7.988,
998
+ "step": 16200
999
+ },
1000
+ {
1001
+ "epoch": 0.25,
1002
+ "learning_rate": 0.001,
1003
+ "loss": 7.988,
1004
+ "step": 16300
1005
+ },
1006
+ {
1007
+ "epoch": 0.26,
1008
+ "learning_rate": 0.001,
1009
+ "loss": 8.02,
1010
+ "step": 16400
1011
+ },
1012
+ {
1013
+ "epoch": 0.26,
1014
+ "learning_rate": 0.001,
1015
+ "loss": 7.9965,
1016
+ "step": 16500
1017
+ },
1018
+ {
1019
+ "epoch": 0.26,
1020
+ "learning_rate": 0.001,
1021
+ "loss": 7.98,
1022
+ "step": 16600
1023
+ },
1024
+ {
1025
+ "epoch": 0.26,
1026
+ "learning_rate": 0.001,
1027
+ "loss": 8.0019,
1028
+ "step": 16700
1029
+ },
1030
+ {
1031
+ "epoch": 0.26,
1032
+ "learning_rate": 0.001,
1033
+ "loss": 8.0005,
1034
+ "step": 16800
1035
+ },
1036
+ {
1037
+ "epoch": 0.26,
1038
+ "learning_rate": 0.001,
1039
+ "loss": 7.9963,
1040
+ "step": 16900
1041
+ },
1042
+ {
1043
+ "epoch": 0.27,
1044
+ "learning_rate": 0.001,
1045
+ "loss": 8.0074,
1046
+ "step": 17000
1047
+ },
1048
+ {
1049
+ "epoch": 0.27,
1050
+ "learning_rate": 0.001,
1051
+ "loss": 8.0089,
1052
+ "step": 17100
1053
+ },
1054
+ {
1055
+ "epoch": 0.27,
1056
+ "learning_rate": 0.001,
1057
+ "loss": 7.973,
1058
+ "step": 17200
1059
+ },
1060
+ {
1061
+ "epoch": 0.27,
1062
+ "learning_rate": 0.001,
1063
+ "loss": 7.9994,
1064
+ "step": 17300
1065
+ },
1066
+ {
1067
+ "epoch": 0.27,
1068
+ "learning_rate": 0.001,
1069
+ "loss": 8.0098,
1070
+ "step": 17400
1071
+ },
1072
+ {
1073
+ "epoch": 0.27,
1074
+ "learning_rate": 0.001,
1075
+ "loss": 7.9892,
1076
+ "step": 17500
1077
+ },
1078
+ {
1079
+ "epoch": 0.28,
1080
+ "learning_rate": 0.001,
1081
+ "loss": 8.0076,
1082
+ "step": 17600
1083
+ },
1084
+ {
1085
+ "epoch": 0.28,
1086
+ "learning_rate": 0.001,
1087
+ "loss": 8.0125,
1088
+ "step": 17700
1089
+ },
1090
+ {
1091
+ "epoch": 0.28,
1092
+ "learning_rate": 0.001,
1093
+ "loss": 8.0044,
1094
+ "step": 17800
1095
+ },
1096
+ {
1097
+ "epoch": 0.28,
1098
+ "learning_rate": 0.001,
1099
+ "loss": 7.9869,
1100
+ "step": 17900
1101
+ },
1102
+ {
1103
+ "epoch": 0.28,
1104
+ "learning_rate": 0.001,
1105
+ "loss": 7.9981,
1106
+ "step": 18000
1107
+ },
1108
+ {
1109
+ "epoch": 0.28,
1110
+ "learning_rate": 0.001,
1111
+ "loss": 7.9865,
1112
+ "step": 18100
1113
+ },
1114
+ {
1115
+ "epoch": 0.28,
1116
+ "learning_rate": 0.001,
1117
+ "loss": 7.9937,
1118
+ "step": 18200
1119
+ },
1120
+ {
1121
+ "epoch": 0.29,
1122
+ "learning_rate": 0.001,
1123
+ "loss": 7.9856,
1124
+ "step": 18300
1125
+ },
1126
+ {
1127
+ "epoch": 0.29,
1128
+ "learning_rate": 0.001,
1129
+ "loss": 7.9883,
1130
+ "step": 18400
1131
+ },
1132
+ {
1133
+ "epoch": 0.29,
1134
+ "learning_rate": 0.001,
1135
+ "loss": 7.9876,
1136
+ "step": 18500
1137
+ },
1138
+ {
1139
+ "epoch": 0.29,
1140
+ "learning_rate": 0.001,
1141
+ "loss": 8.0041,
1142
+ "step": 18600
1143
+ },
1144
+ {
1145
+ "epoch": 0.29,
1146
+ "learning_rate": 0.001,
1147
+ "loss": 7.9864,
1148
+ "step": 18700
1149
+ },
1150
+ {
1151
+ "epoch": 0.29,
1152
+ "learning_rate": 0.001,
1153
+ "loss": 7.966,
1154
+ "step": 18800
1155
+ },
1156
+ {
1157
+ "epoch": 0.3,
1158
+ "learning_rate": 0.001,
1159
+ "loss": 7.9774,
1160
+ "step": 18900
1161
+ },
1162
+ {
1163
+ "epoch": 0.3,
1164
+ "learning_rate": 0.001,
1165
+ "loss": 8.0118,
1166
+ "step": 19000
1167
+ },
1168
+ {
1169
+ "epoch": 0.3,
1170
+ "learning_rate": 0.001,
1171
+ "loss": 7.9959,
1172
+ "step": 19100
1173
+ },
1174
+ {
1175
+ "epoch": 0.3,
1176
+ "learning_rate": 0.001,
1177
+ "loss": 7.9961,
1178
+ "step": 19200
1179
+ },
1180
+ {
1181
+ "epoch": 0.3,
1182
+ "eval_accuracy": 0.03346063723302363,
1183
+ "eval_loss": 7.996260643005371,
1184
+ "eval_runtime": 9873.4515,
1185
+ "eval_samples_per_second": 33.206,
1186
+ "eval_steps_per_second": 2.075,
1187
+ "step": 19200
1188
  }
1189
  ],
1190
  "max_steps": 64000,
1191
  "num_train_epochs": 9223372036854775807,
1192
+ "total_flos": 2.030950586253312e+17,
1193
  "trial_name": null,
1194
  "trial_params": null
1195
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9161e0e47a64c5b65b5d9cdc06273c036dd388860216eaa3c16c2c8bd9536ef
3
  size 372832803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55326f8f5fa73d8cae36ee1024b1ca50073bb54f266592786e9989158b712f06
3
  size 372832803