kiddothe2b commited on
Commit
eda42da
·
1 Parent(s): 3120a0c

Training in progress, step 12800

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eeb147a8b52f2362925faab169fbd56947857b46695d0ad6318718dd8abd09eb
3
- size 6318359
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:383875ed49bab0b3a07e77766efb44191fb9f1834ccf4e7c6e4692b925b1a4d5
3
+ size 745634697
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11917ced3abb253bddb97b54dc3bded52162c226089de58b40c05546c838aae2
3
  size 372832803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9161e0e47a64c5b65b5d9cdc06273c036dd388860216eaa3c16c2c8bd9536ef
3
  size 372832803
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:402f4fb235771e1a788e3ce5b339b64bdc56a41d7760bbee78cf69d0ca1ad3d3
3
  size 15523
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:545fcebd225c2fbcaaae084db32b315ff159bcb9f66f876ced049afa99cb2632
3
  size 15523
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:841a158b0d212253f125ebf1f87bda4797e00292f1d39571b4724f0ab5ed90ad
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a63c18679f872f561021a84d9bfcd3fad0c807bcef87d1a807b9818f9895c1f
3
  size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.1,
5
- "global_step": 6400,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -399,11 +399,404 @@
399
  "eval_samples_per_second": 28.861,
400
  "eval_steps_per_second": 1.804,
401
  "step": 6400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
  }
403
  ],
404
  "max_steps": 64000,
405
  "num_train_epochs": 9223372036854775807,
406
- "total_flos": 6.76983528751104e+16,
407
  "trial_name": null,
408
  "trial_params": null
409
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.2,
5
+ "global_step": 12800,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
399
  "eval_samples_per_second": 28.861,
400
  "eval_steps_per_second": 1.804,
401
  "step": 6400
402
+ },
403
+ {
404
+ "epoch": 0.1,
405
+ "learning_rate": 0.001,
406
+ "loss": 8.2989,
407
+ "step": 6500
408
+ },
409
+ {
410
+ "epoch": 0.1,
411
+ "learning_rate": 0.001,
412
+ "loss": 8.0898,
413
+ "step": 6600
414
+ },
415
+ {
416
+ "epoch": 0.1,
417
+ "learning_rate": 0.001,
418
+ "loss": 8.0602,
419
+ "step": 6700
420
+ },
421
+ {
422
+ "epoch": 0.11,
423
+ "learning_rate": 0.001,
424
+ "loss": 8.0533,
425
+ "step": 6800
426
+ },
427
+ {
428
+ "epoch": 0.11,
429
+ "learning_rate": 0.001,
430
+ "loss": 8.035,
431
+ "step": 6900
432
+ },
433
+ {
434
+ "epoch": 0.11,
435
+ "learning_rate": 0.001,
436
+ "loss": 8.0363,
437
+ "step": 7000
438
+ },
439
+ {
440
+ "epoch": 0.11,
441
+ "learning_rate": 0.001,
442
+ "loss": 8.0243,
443
+ "step": 7100
444
+ },
445
+ {
446
+ "epoch": 0.11,
447
+ "learning_rate": 0.001,
448
+ "loss": 8.0337,
449
+ "step": 7200
450
+ },
451
+ {
452
+ "epoch": 0.11,
453
+ "learning_rate": 0.001,
454
+ "loss": 8.0223,
455
+ "step": 7300
456
+ },
457
+ {
458
+ "epoch": 0.12,
459
+ "learning_rate": 0.001,
460
+ "loss": 8.0214,
461
+ "step": 7400
462
+ },
463
+ {
464
+ "epoch": 0.12,
465
+ "learning_rate": 0.001,
466
+ "loss": 8.0214,
467
+ "step": 7500
468
+ },
469
+ {
470
+ "epoch": 0.12,
471
+ "learning_rate": 0.001,
472
+ "loss": 8.0183,
473
+ "step": 7600
474
+ },
475
+ {
476
+ "epoch": 0.12,
477
+ "learning_rate": 0.001,
478
+ "loss": 8.0246,
479
+ "step": 7700
480
+ },
481
+ {
482
+ "epoch": 0.12,
483
+ "learning_rate": 0.001,
484
+ "loss": 8.0169,
485
+ "step": 7800
486
+ },
487
+ {
488
+ "epoch": 0.12,
489
+ "learning_rate": 0.001,
490
+ "loss": 8.0206,
491
+ "step": 7900
492
+ },
493
+ {
494
+ "epoch": 0.12,
495
+ "learning_rate": 0.001,
496
+ "loss": 8.0119,
497
+ "step": 8000
498
+ },
499
+ {
500
+ "epoch": 0.13,
501
+ "learning_rate": 0.001,
502
+ "loss": 8.0098,
503
+ "step": 8100
504
+ },
505
+ {
506
+ "epoch": 0.13,
507
+ "learning_rate": 0.001,
508
+ "loss": 8.0122,
509
+ "step": 8200
510
+ },
511
+ {
512
+ "epoch": 0.13,
513
+ "learning_rate": 0.001,
514
+ "loss": 7.9969,
515
+ "step": 8300
516
+ },
517
+ {
518
+ "epoch": 0.13,
519
+ "learning_rate": 0.001,
520
+ "loss": 7.9958,
521
+ "step": 8400
522
+ },
523
+ {
524
+ "epoch": 0.13,
525
+ "learning_rate": 0.001,
526
+ "loss": 8.0233,
527
+ "step": 8500
528
+ },
529
+ {
530
+ "epoch": 0.13,
531
+ "learning_rate": 0.001,
532
+ "loss": 7.992,
533
+ "step": 8600
534
+ },
535
+ {
536
+ "epoch": 0.14,
537
+ "learning_rate": 0.001,
538
+ "loss": 7.9863,
539
+ "step": 8700
540
+ },
541
+ {
542
+ "epoch": 0.14,
543
+ "learning_rate": 0.001,
544
+ "loss": 8.0073,
545
+ "step": 8800
546
+ },
547
+ {
548
+ "epoch": 0.14,
549
+ "learning_rate": 0.001,
550
+ "loss": 7.988,
551
+ "step": 8900
552
+ },
553
+ {
554
+ "epoch": 0.14,
555
+ "learning_rate": 0.001,
556
+ "loss": 8.0064,
557
+ "step": 9000
558
+ },
559
+ {
560
+ "epoch": 0.14,
561
+ "learning_rate": 0.001,
562
+ "loss": 8.0152,
563
+ "step": 9100
564
+ },
565
+ {
566
+ "epoch": 0.14,
567
+ "learning_rate": 0.001,
568
+ "loss": 8.0111,
569
+ "step": 9200
570
+ },
571
+ {
572
+ "epoch": 0.15,
573
+ "learning_rate": 0.001,
574
+ "loss": 7.9859,
575
+ "step": 9300
576
+ },
577
+ {
578
+ "epoch": 0.15,
579
+ "learning_rate": 0.001,
580
+ "loss": 8.0041,
581
+ "step": 9400
582
+ },
583
+ {
584
+ "epoch": 0.15,
585
+ "learning_rate": 0.001,
586
+ "loss": 8.0028,
587
+ "step": 9500
588
+ },
589
+ {
590
+ "epoch": 0.15,
591
+ "learning_rate": 0.001,
592
+ "loss": 7.9986,
593
+ "step": 9600
594
+ },
595
+ {
596
+ "epoch": 0.15,
597
+ "learning_rate": 0.001,
598
+ "loss": 8.0031,
599
+ "step": 9700
600
+ },
601
+ {
602
+ "epoch": 0.15,
603
+ "learning_rate": 0.001,
604
+ "loss": 8.0014,
605
+ "step": 9800
606
+ },
607
+ {
608
+ "epoch": 0.15,
609
+ "learning_rate": 0.001,
610
+ "loss": 8.015,
611
+ "step": 9900
612
+ },
613
+ {
614
+ "epoch": 0.16,
615
+ "learning_rate": 0.001,
616
+ "loss": 8.0085,
617
+ "step": 10000
618
+ },
619
+ {
620
+ "epoch": 0.16,
621
+ "learning_rate": 0.001,
622
+ "loss": 7.9976,
623
+ "step": 10100
624
+ },
625
+ {
626
+ "epoch": 0.16,
627
+ "learning_rate": 0.001,
628
+ "loss": 8.0013,
629
+ "step": 10200
630
+ },
631
+ {
632
+ "epoch": 0.16,
633
+ "learning_rate": 0.001,
634
+ "loss": 8.0002,
635
+ "step": 10300
636
+ },
637
+ {
638
+ "epoch": 0.16,
639
+ "learning_rate": 0.001,
640
+ "loss": 8.0142,
641
+ "step": 10400
642
+ },
643
+ {
644
+ "epoch": 0.16,
645
+ "learning_rate": 0.001,
646
+ "loss": 8.0135,
647
+ "step": 10500
648
+ },
649
+ {
650
+ "epoch": 0.17,
651
+ "learning_rate": 0.001,
652
+ "loss": 7.9931,
653
+ "step": 10600
654
+ },
655
+ {
656
+ "epoch": 0.17,
657
+ "learning_rate": 0.001,
658
+ "loss": 8.0043,
659
+ "step": 10700
660
+ },
661
+ {
662
+ "epoch": 0.17,
663
+ "learning_rate": 0.001,
664
+ "loss": 8.0092,
665
+ "step": 10800
666
+ },
667
+ {
668
+ "epoch": 0.17,
669
+ "learning_rate": 0.001,
670
+ "loss": 7.9803,
671
+ "step": 10900
672
+ },
673
+ {
674
+ "epoch": 0.17,
675
+ "learning_rate": 0.001,
676
+ "loss": 8.011,
677
+ "step": 11000
678
+ },
679
+ {
680
+ "epoch": 0.17,
681
+ "learning_rate": 0.001,
682
+ "loss": 7.9887,
683
+ "step": 11100
684
+ },
685
+ {
686
+ "epoch": 0.17,
687
+ "learning_rate": 0.001,
688
+ "loss": 8.0008,
689
+ "step": 11200
690
+ },
691
+ {
692
+ "epoch": 0.18,
693
+ "learning_rate": 0.001,
694
+ "loss": 8.0118,
695
+ "step": 11300
696
+ },
697
+ {
698
+ "epoch": 0.18,
699
+ "learning_rate": 0.001,
700
+ "loss": 7.9928,
701
+ "step": 11400
702
+ },
703
+ {
704
+ "epoch": 0.18,
705
+ "learning_rate": 0.001,
706
+ "loss": 8.0043,
707
+ "step": 11500
708
+ },
709
+ {
710
+ "epoch": 0.18,
711
+ "learning_rate": 0.001,
712
+ "loss": 7.9995,
713
+ "step": 11600
714
+ },
715
+ {
716
+ "epoch": 0.18,
717
+ "learning_rate": 0.001,
718
+ "loss": 8.0006,
719
+ "step": 11700
720
+ },
721
+ {
722
+ "epoch": 0.18,
723
+ "learning_rate": 0.001,
724
+ "loss": 8.0017,
725
+ "step": 11800
726
+ },
727
+ {
728
+ "epoch": 0.19,
729
+ "learning_rate": 0.001,
730
+ "loss": 7.9999,
731
+ "step": 11900
732
+ },
733
+ {
734
+ "epoch": 0.19,
735
+ "learning_rate": 0.001,
736
+ "loss": 8.0084,
737
+ "step": 12000
738
+ },
739
+ {
740
+ "epoch": 0.19,
741
+ "learning_rate": 0.001,
742
+ "loss": 7.9976,
743
+ "step": 12100
744
+ },
745
+ {
746
+ "epoch": 0.19,
747
+ "learning_rate": 0.001,
748
+ "loss": 7.9833,
749
+ "step": 12200
750
+ },
751
+ {
752
+ "epoch": 0.19,
753
+ "learning_rate": 0.001,
754
+ "loss": 8.0164,
755
+ "step": 12300
756
+ },
757
+ {
758
+ "epoch": 0.19,
759
+ "learning_rate": 0.001,
760
+ "loss": 8.0209,
761
+ "step": 12400
762
+ },
763
+ {
764
+ "epoch": 0.2,
765
+ "learning_rate": 0.001,
766
+ "loss": 8.0044,
767
+ "step": 12500
768
+ },
769
+ {
770
+ "epoch": 0.2,
771
+ "learning_rate": 0.001,
772
+ "loss": 8.0032,
773
+ "step": 12600
774
+ },
775
+ {
776
+ "epoch": 0.2,
777
+ "learning_rate": 0.001,
778
+ "loss": 8.0112,
779
+ "step": 12700
780
+ },
781
+ {
782
+ "epoch": 0.2,
783
+ "learning_rate": 0.001,
784
+ "loss": 8.0055,
785
+ "step": 12800
786
+ },
787
+ {
788
+ "epoch": 0.2,
789
+ "eval_accuracy": 0.034224256973106555,
790
+ "eval_loss": 7.999633312225342,
791
+ "eval_runtime": 9037.3044,
792
+ "eval_samples_per_second": 36.278,
793
+ "eval_steps_per_second": 2.267,
794
+ "step": 12800
795
  }
796
  ],
797
  "max_steps": 64000,
798
  "num_train_epochs": 9223372036854775807,
799
+ "total_flos": 1.353967057502208e+17,
800
  "trial_name": null,
801
  "trial_params": null
802
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11917ced3abb253bddb97b54dc3bded52162c226089de58b40c05546c838aae2
3
  size 372832803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9161e0e47a64c5b65b5d9cdc06273c036dd388860216eaa3c16c2c8bd9536ef
3
  size 372832803