Training in progress, step 2600, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +711 -3

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:412ae50cdeb5cca99c6d46aab796b0711066e3d9f4b41a911e3eb9d3dc6de17f
 size 2066752

 version https://git-lfs.github.com/spec/v1
+oid sha256:2f7029e76d2330b8c1ac76f27d2827fd53c1fb9a09bfd163aeb48e6ef056512d
 size 2066752

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:836b014a8c276d1f5618f6f4807f34376d58eccee7d3467c4acd8af2f036f8f3
 size 4121235

 version https://git-lfs.github.com/spec/v1
+oid sha256:5ab2bfc3184fdccd803bb517356e783cb35e80aca85ffa6d029528770eb4cd07
 size 4121235

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ed9d71331f73f26faac079d17a5f8873c17bceffe8dbf3eb835123619d3824be
 size 14391

 version https://git-lfs.github.com/spec/v1
+oid sha256:cbe8803c48cf63f4eea1ebb748b4c2beb1d95a5bd75f9d32b496d3b2c5d0dd4e
 size 14391

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:28a833366aa970d3c976fd14c1ac36f1a287b5de565f4adb4a55d51debbe07ea
 size 1401

 version https://git-lfs.github.com/spec/v1
+oid sha256:9c68a3dab5d287edad29b6a8ea33c1819f3635324de902c425ef8f164d34fb46
 size 1401

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.10797270450030233,
   "eval_steps": 100,
-  "global_step": 2500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -17708,6 +17708,714 @@
       "eval_samples_per_second": 1.114,
       "eval_steps_per_second": 0.139,
       "step": 2500
     }
   ],
   "logging_steps": 1,
@@ -17727,7 +18435,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7991377920000.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.11229161268031441,
   "eval_steps": 100,
+  "global_step": 2600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 1.114,
       "eval_steps_per_second": 0.139,
       "step": 2500
+    },
+    {
+      "epoch": 0.10801589358210245,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0009908435451512379,
+      "loss": 8.5773,
+      "step": 2501
+    },
+    {
+      "epoch": 0.10805908266390256,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.000990829935964463,
+      "loss": 8.0601,
+      "step": 2502
+    },
+    {
+      "epoch": 0.10810227174570268,
+      "grad_norm": 0.61328125,
+      "learning_rate": 0.0009908163167651686,
+      "loss": 8.3679,
+      "step": 2503
+    },
+    {
+      "epoch": 0.1081454608275028,
+      "grad_norm": 0.87890625,
+      "learning_rate": 0.000990802687553633,
+      "loss": 7.7936,
+      "step": 2504
+    },
+    {
+      "epoch": 0.10818864990930292,
+      "grad_norm": 0.458984375,
+      "learning_rate": 0.000990789048330134,
+      "loss": 8.268,
+      "step": 2505
+    },
+    {
+      "epoch": 0.10823183899110304,
+      "grad_norm": 0.875,
+      "learning_rate": 0.0009907753990949495,
+      "loss": 8.4076,
+      "step": 2506
+    },
+    {
+      "epoch": 0.10827502807290316,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.0009907617398483583,
+      "loss": 8.0655,
+      "step": 2507
+    },
+    {
+      "epoch": 0.10831821715470329,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0009907480705906393,
+      "loss": 8.4049,
+      "step": 2508
+    },
+    {
+      "epoch": 0.10836140623650341,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.0009907343913220707,
+      "loss": 8.2208,
+      "step": 2509
+    },
+    {
+      "epoch": 0.10840459531830353,
+      "grad_norm": 0.859375,
+      "learning_rate": 0.0009907207020429319,
+      "loss": 7.9682,
+      "step": 2510
+    },
+    {
+      "epoch": 0.10844778440010365,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.000990707002753502,
+      "loss": 8.4395,
+      "step": 2511
+    },
+    {
+      "epoch": 0.10849097348190377,
+      "grad_norm": 0.546875,
+      "learning_rate": 0.0009906932934540607,
+      "loss": 8.1248,
+      "step": 2512
+    },
+    {
+      "epoch": 0.10853416256370389,
+      "grad_norm": 0.455078125,
+      "learning_rate": 0.0009906795741448876,
+      "loss": 8.3858,
+      "step": 2513
+    },
+    {
+      "epoch": 0.10857735164550401,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0009906658448262623,
+      "loss": 8.2592,
+      "step": 2514
+    },
+    {
+      "epoch": 0.10862054072730414,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.000990652105498465,
+      "loss": 8.3764,
+      "step": 2515
+    },
+    {
+      "epoch": 0.10866372980910426,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.0009906383561617761,
+      "loss": 7.9524,
+      "step": 2516
+    },
+    {
+      "epoch": 0.10870691889090438,
+      "grad_norm": 0.6796875,
+      "learning_rate": 0.000990624596816476,
+      "loss": 8.2658,
+      "step": 2517
+    },
+    {
+      "epoch": 0.1087501079727045,
+      "grad_norm": 0.76171875,
+      "learning_rate": 0.0009906108274628455,
+      "loss": 8.1708,
+      "step": 2518
+    },
+    {
+      "epoch": 0.10879329705450462,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0009905970481011652,
+      "loss": 8.3299,
+      "step": 2519
+    },
+    {
+      "epoch": 0.10883648613630474,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009905832587317163,
+      "loss": 8.2721,
+      "step": 2520
+    },
+    {
+      "epoch": 0.10887967521810486,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009905694593547803,
+      "loss": 8.2087,
+      "step": 2521
+    },
+    {
+      "epoch": 0.10892286429990498,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0009905556499706382,
+      "loss": 8.4273,
+      "step": 2522
+    },
+    {
+      "epoch": 0.1089660533817051,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0009905418305795723,
+      "loss": 8.2658,
+      "step": 2523
+    },
+    {
+      "epoch": 0.10900924246350523,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009905280011818644,
+      "loss": 7.9867,
+      "step": 2524
+    },
+    {
+      "epoch": 0.10905243154530535,
+      "grad_norm": 0.46484375,
+      "learning_rate": 0.000990514161777796,
+      "loss": 8.4169,
+      "step": 2525
+    },
+    {
+      "epoch": 0.10909562062710547,
+      "grad_norm": 0.40234375,
+      "learning_rate": 0.0009905003123676503,
+      "loss": 8.4543,
+      "step": 2526
+    },
+    {
+      "epoch": 0.10913880970890559,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.000990486452951709,
+      "loss": 8.0104,
+      "step": 2527
+    },
+    {
+      "epoch": 0.10918199879070571,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0009904725835302552,
+      "loss": 8.2101,
+      "step": 2528
+    },
+    {
+      "epoch": 0.10922518787250583,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.000990458704103572,
+      "loss": 8.1934,
+      "step": 2529
+    },
+    {
+      "epoch": 0.10926837695430595,
+      "grad_norm": 0.59375,
+      "learning_rate": 0.0009904448146719421,
+      "loss": 8.4709,
+      "step": 2530
+    },
+    {
+      "epoch": 0.10931156603610608,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0009904309152356495,
+      "loss": 8.3082,
+      "step": 2531
+    },
+    {
+      "epoch": 0.1093547551179062,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0009904170057949769,
+      "loss": 8.3866,
+      "step": 2532
+    },
+    {
+      "epoch": 0.10939794419970632,
+      "grad_norm": 1.0703125,
+      "learning_rate": 0.0009904030863502086,
+      "loss": 8.956,
+      "step": 2533
+    },
+    {
+      "epoch": 0.10944113328150644,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.0009903891569016283,
+      "loss": 8.1289,
+      "step": 2534
+    },
+    {
+      "epoch": 0.10948432236330656,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0009903752174495203,
+      "loss": 8.267,
+      "step": 2535
+    },
+    {
+      "epoch": 0.10952751144510668,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.000990361267994169,
+      "loss": 8.279,
+      "step": 2536
+    },
+    {
+      "epoch": 0.1095707005269068,
+      "grad_norm": 0.75390625,
+      "learning_rate": 0.0009903473085358587,
+      "loss": 8.0143,
+      "step": 2537
+    },
+    {
+      "epoch": 0.10961388960870692,
+      "grad_norm": 0.42578125,
+      "learning_rate": 0.0009903333390748747,
+      "loss": 8.196,
+      "step": 2538
+    },
+    {
+      "epoch": 0.10965707869050705,
+      "grad_norm": 0.56640625,
+      "learning_rate": 0.000990319359611501,
+      "loss": 8.4366,
+      "step": 2539
+    },
+    {
+      "epoch": 0.10970026777230717,
+      "grad_norm": 0.5,
+      "learning_rate": 0.0009903053701460236,
+      "loss": 8.6248,
+      "step": 2540
+    },
+    {
+      "epoch": 0.10974345685410729,
+      "grad_norm": 0.5390625,
+      "learning_rate": 0.0009902913706787279,
+      "loss": 8.3567,
+      "step": 2541
+    },
+    {
+      "epoch": 0.10978664593590741,
+      "grad_norm": 0.6484375,
+      "learning_rate": 0.0009902773612098987,
+      "loss": 8.1894,
+      "step": 2542
+    },
+    {
+      "epoch": 0.10982983501770752,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0009902633417398225,
+      "loss": 8.4882,
+      "step": 2543
+    },
+    {
+      "epoch": 0.10987302409950764,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0009902493122687852,
+      "loss": 7.9297,
+      "step": 2544
+    },
+    {
+      "epoch": 0.10991621318130776,
+      "grad_norm": 0.796875,
+      "learning_rate": 0.0009902352727970728,
+      "loss": 8.1606,
+      "step": 2545
+    },
+    {
+      "epoch": 0.10995940226310788,
+      "grad_norm": 0.60546875,
+      "learning_rate": 0.0009902212233249717,
+      "loss": 8.3975,
+      "step": 2546
+    },
+    {
+      "epoch": 0.110002591344908,
+      "grad_norm": 0.48046875,
+      "learning_rate": 0.0009902071638527685,
+      "loss": 8.5701,
+      "step": 2547
+    },
+    {
+      "epoch": 0.11004578042670812,
+      "grad_norm": 0.490234375,
+      "learning_rate": 0.0009901930943807503,
+      "loss": 8.1647,
+      "step": 2548
+    },
+    {
+      "epoch": 0.11008896950850824,
+      "grad_norm": 0.57421875,
+      "learning_rate": 0.0009901790149092035,
+      "loss": 8.1903,
+      "step": 2549
+    },
+    {
+      "epoch": 0.11013215859030837,
+      "grad_norm": 0.79296875,
+      "learning_rate": 0.0009901649254384158,
+      "loss": 7.8448,
+      "step": 2550
+    },
+    {
+      "epoch": 0.11017534767210849,
+      "grad_norm": 0.62109375,
+      "learning_rate": 0.0009901508259686745,
+      "loss": 8.0894,
+      "step": 2551
+    },
+    {
+      "epoch": 0.11021853675390861,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.0009901367165002673,
+      "loss": 8.2087,
+      "step": 2552
+    },
+    {
+      "epoch": 0.11026172583570873,
+      "grad_norm": 0.51171875,
+      "learning_rate": 0.0009901225970334816,
+      "loss": 8.2029,
+      "step": 2553
+    },
+    {
+      "epoch": 0.11030491491750885,
+      "grad_norm": 0.5703125,
+      "learning_rate": 0.0009901084675686062,
+      "loss": 8.4018,
+      "step": 2554
+    },
+    {
+      "epoch": 0.11034810399930897,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009900943281059287,
+      "loss": 8.3994,
+      "step": 2555
+    },
+    {
+      "epoch": 0.1103912930811091,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0009900801786457375,
+      "loss": 8.138,
+      "step": 2556
+    },
+    {
+      "epoch": 0.11043448216290921,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009900660191883217,
+      "loss": 8.1755,
+      "step": 2557
+    },
+    {
+      "epoch": 0.11047767124470934,
+      "grad_norm": 0.69921875,
+      "learning_rate": 0.0009900518497339696,
+      "loss": 8.2262,
+      "step": 2558
+    },
+    {
+      "epoch": 0.11052086032650946,
+      "grad_norm": 0.53125,
+      "learning_rate": 0.0009900376702829707,
+      "loss": 8.4282,
+      "step": 2559
+    },
+    {
+      "epoch": 0.11056404940830958,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0009900234808356142,
+      "loss": 8.3542,
+      "step": 2560
+    },
+    {
+      "epoch": 0.1106072384901097,
+      "grad_norm": 0.5078125,
+      "learning_rate": 0.0009900092813921893,
+      "loss": 8.206,
+      "step": 2561
+    },
+    {
+      "epoch": 0.11065042757190982,
+      "grad_norm": 0.515625,
+      "learning_rate": 0.0009899950719529857,
+      "loss": 8.3899,
+      "step": 2562
+    },
+    {
+      "epoch": 0.11069361665370994,
+      "grad_norm": 0.478515625,
+      "learning_rate": 0.0009899808525182935,
+      "loss": 8.4189,
+      "step": 2563
+    },
+    {
+      "epoch": 0.11073680573551006,
+      "grad_norm": 0.58203125,
+      "learning_rate": 0.0009899666230884024,
+      "loss": 8.3508,
+      "step": 2564
+    },
+    {
+      "epoch": 0.11077999481731018,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.0009899523836636032,
+      "loss": 8.0618,
+      "step": 2565
+    },
+    {
+      "epoch": 0.1108231838991103,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0009899381342441857,
+      "loss": 8.1894,
+      "step": 2566
+    },
+    {
+      "epoch": 0.11086637298091043,
+      "grad_norm": 0.63671875,
+      "learning_rate": 0.0009899238748304411,
+      "loss": 8.373,
+      "step": 2567
+    },
+    {
+      "epoch": 0.11090956206271055,
+      "grad_norm": 0.5234375,
+      "learning_rate": 0.0009899096054226601,
+      "loss": 8.3343,
+      "step": 2568
+    },
+    {
+      "epoch": 0.11095275114451067,
+      "grad_norm": 0.80859375,
+      "learning_rate": 0.0009898953260211339,
+      "loss": 8.0553,
+      "step": 2569
+    },
+    {
+      "epoch": 0.11099594022631079,
+      "grad_norm": 0.6640625,
+      "learning_rate": 0.0009898810366261535,
+      "loss": 8.664,
+      "step": 2570
+    },
+    {
+      "epoch": 0.11103912930811091,
+      "grad_norm": 0.578125,
+      "learning_rate": 0.0009898667372380107,
+      "loss": 8.3618,
+      "step": 2571
+    },
+    {
+      "epoch": 0.11108231838991103,
+      "grad_norm": 0.49609375,
+      "learning_rate": 0.000989852427856997,
+      "loss": 8.279,
+      "step": 2572
+    },
+    {
+      "epoch": 0.11112550747171115,
+      "grad_norm": 0.6328125,
+      "learning_rate": 0.0009898381084834044,
+      "loss": 8.2579,
+      "step": 2573
+    },
+    {
+      "epoch": 0.11116869655351128,
+      "grad_norm": 0.494140625,
+      "learning_rate": 0.000989823779117525,
+      "loss": 8.2548,
+      "step": 2574
+    },
+    {
+      "epoch": 0.1112118856353114,
+      "grad_norm": 1.15625,
+      "learning_rate": 0.000989809439759651,
+      "loss": 8.7369,
+      "step": 2575
+    },
+    {
+      "epoch": 0.11125507471711152,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.000989795090410075,
+      "loss": 7.8746,
+      "step": 2576
+    },
+    {
+      "epoch": 0.11129826379891164,
+      "grad_norm": 0.7890625,
+      "learning_rate": 0.0009897807310690898,
+      "loss": 8.6083,
+      "step": 2577
+    },
+    {
+      "epoch": 0.11134145288071176,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.000989766361736988,
+      "loss": 8.4279,
+      "step": 2578
+    },
+    {
+      "epoch": 0.11138464196251188,
+      "grad_norm": 1.1015625,
+      "learning_rate": 0.0009897519824140632,
+      "loss": 8.1736,
+      "step": 2579
+    },
+    {
+      "epoch": 0.111427831044312,
+      "grad_norm": 0.734375,
+      "learning_rate": 0.0009897375931006082,
+      "loss": 8.6102,
+      "step": 2580
+    },
+    {
+      "epoch": 0.11147102012611212,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0009897231937969172,
+      "loss": 7.9732,
+      "step": 2581
+    },
+    {
+      "epoch": 0.11151420920791225,
+      "grad_norm": 0.47265625,
+      "learning_rate": 0.0009897087845032832,
+      "loss": 8.2183,
+      "step": 2582
+    },
+    {
+      "epoch": 0.11155739828971237,
+      "grad_norm": 0.46875,
+      "learning_rate": 0.0009896943652200005,
+      "loss": 8.4685,
+      "step": 2583
+    },
+    {
+      "epoch": 0.11160058737151249,
+      "grad_norm": 0.5859375,
+      "learning_rate": 0.0009896799359473635,
+      "loss": 8.4296,
+      "step": 2584
+    },
+    {
+      "epoch": 0.1116437764533126,
+      "grad_norm": 0.474609375,
+      "learning_rate": 0.000989665496685666,
+      "loss": 8.4101,
+      "step": 2585
+    },
+    {
+      "epoch": 0.11168696553511272,
+      "grad_norm": 0.451171875,
+      "learning_rate": 0.0009896510474352027,
+      "loss": 8.5164,
+      "step": 2586
+    },
+    {
+      "epoch": 0.11173015461691284,
+      "grad_norm": 0.55078125,
+      "learning_rate": 0.0009896365881962685,
+      "loss": 8.3774,
+      "step": 2587
+    },
+    {
+      "epoch": 0.11177334369871296,
+      "grad_norm": 0.54296875,
+      "learning_rate": 0.0009896221189691586,
+      "loss": 8.139,
+      "step": 2588
+    },
+    {
+      "epoch": 0.11181653278051308,
+      "grad_norm": 0.65234375,
+      "learning_rate": 0.0009896076397541676,
+      "loss": 8.3887,
+      "step": 2589
+    },
+    {
+      "epoch": 0.1118597218623132,
+      "grad_norm": 0.53515625,
+      "learning_rate": 0.0009895931505515914,
+      "loss": 8.2581,
+      "step": 2590
+    },
+    {
+      "epoch": 0.11190291094411332,
+      "grad_norm": 0.5546875,
+      "learning_rate": 0.000989578651361725,
+      "loss": 8.1766,
+      "step": 2591
+    },
+    {
+      "epoch": 0.11194610002591344,
+      "grad_norm": 0.6171875,
+      "learning_rate": 0.0009895641421848646,
+      "loss": 8.399,
+      "step": 2592
+    },
+    {
+      "epoch": 0.11198928910771357,
+      "grad_norm": 0.6328125,
+      "learning_rate": 0.000989549623021306,
+      "loss": 8.3802,
+      "step": 2593
+    },
+    {
+      "epoch": 0.11203247818951369,
+      "grad_norm": 0.609375,
+      "learning_rate": 0.0009895350938713455,
+      "loss": 8.1978,
+      "step": 2594
+    },
+    {
+      "epoch": 0.11207566727131381,
+      "grad_norm": 0.640625,
+      "learning_rate": 0.0009895205547352794,
+      "loss": 8.2764,
+      "step": 2595
+    },
+    {
+      "epoch": 0.11211885635311393,
+      "grad_norm": 0.66796875,
+      "learning_rate": 0.0009895060056134045,
+      "loss": 8.0888,
+      "step": 2596
+    },
+    {
+      "epoch": 0.11216204543491405,
+      "grad_norm": 0.435546875,
+      "learning_rate": 0.0009894914465060172,
+      "loss": 8.5272,
+      "step": 2597
+    },
+    {
+      "epoch": 0.11220523451671417,
+      "grad_norm": 0.55859375,
+      "learning_rate": 0.0009894768774134147,
+      "loss": 8.2076,
+      "step": 2598
+    },
+    {
+      "epoch": 0.1122484235985143,
+      "grad_norm": 0.52734375,
+      "learning_rate": 0.0009894622983358942,
+      "loss": 8.3313,
+      "step": 2599
+    },
+    {
+      "epoch": 0.11229161268031441,
+      "grad_norm": 0.80078125,
+      "learning_rate": 0.0009894477092737529,
+      "loss": 8.4905,
+      "step": 2600
+    },
+    {
+      "epoch": 0.11229161268031441,
+      "eval_loss": 8.336125373840332,
+      "eval_runtime": 15.4776,
+      "eval_samples_per_second": 1.551,
+      "eval_steps_per_second": 0.194,
+      "step": 2600
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 8311033036800.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null